In [1]:
# !pip install transformers==4.46.2 datasets==3.1.0 numpy==1.26.4 sklearn-pandas==2.2.0 torch==2.5.1+cu121

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
import datasets
import time
import wandb
import random
# import os
# from fvcore.nn import FlopCountAnalysis
# from torch.utils.data import DataLoader
import torch

key_file = r'C:\Development\TactitalTensorsFinalProject\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

# Log into WandB with the API key
wandb.login(key=api_key)

# Initialize WandB (no need to manually set the WANDB_API_KEY env variable again)


  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mayoungren94[0m ([33mayoungren-colostate[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc


True

The WANDB_API_KEY is nessesary to use the Trainer class from HuggingFace and is what's used in the example notebook. However, I think we may all have to put in our own api key secret into google colab for it to work.

In [2]:
# from google.colab import userdata
# os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')


In [3]:
langs = ['java', 'python', 'pharo']

# langs = ['java'] # Using Java as the only language for testing purposes.
# langs = ['python']
# langs = ['pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = datasets.load_dataset('NLBSE/nlbse25-code-comment-classification')

In [4]:
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [5]:
ds['java_test'][0]

{'index': 5,
 'class': 'AbstractContractGetFileStatusTest.java',
 'comment_sentence': 'accept everything.',
 'partition': 1,
 'combo': 'accept everything. | AbstractContractGetFileStatusTest.java',
 'labels': [0, 0, 1, 0, 0, 0, 0]}

In [None]:
model_name = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The next few functions are to preprocess the trainng and validation sets, and to let the Trainer class to evaluate how well the trainng is going after each epoch.

In [7]:
# To tokenize the text in the 'combo' column of the training dataset of each language.

def tokenize_dataset(examples):
  return tokenizer(examples['combo'], truncation=True, padding="max_length", max_length=128)

In [8]:
# Tokenizes the text 'combo' column of the dataset, changes the values of the
# labels column to float instead of int, and sets to format of the lables column
# to torch tesnsors, which is required by the Trainer class.

def preprocess_dataset(input_dataset):
  processed_dataset = input_dataset.map(tokenize_dataset, batched=True, load_from_cache_file=False)
  processed_dataset = processed_dataset.cast_column("labels", datasets.features.Sequence(datasets.features.Value("float32")))
  processed_dataset.set_format('pt')
  return processed_dataset

Evaluating the predictions.

Below is non-functional code I still haven't finnished. We need to implement a way for the logits returned from the model to be converted into binary (0 - 1) predictions for the labels, and for those labels to be compaired to the ground truth labels of the testing datasets for each language. The competition also wants us to complete this step using profiling that they have set up, and which is partially shown below. The link to the reference notebook for the competition is here. Also, another notebook that is fairly close to what I believe we are trying to do can be found here_for_multi_label_text_classification.ipynb).

In [None]:
# scores must be global so that it can later be referenced for printing the results
scores = pd.DataFrame(columns=['comparison', 'model', 'lan', 'cat', 'precision', 'recall', 'f1'])

def compute_metrics(eval_pred, lang, categories, mod_name, comp):
    # global scores
    # scores.drop(scores[scores['lan'] == lang].index, inplace=True)

    predictions, true_labels = eval_pred
    predictions = (predictions > 0.5).astype(int)
    num_classes = len(categories)

    metrics_list = []

    for i in range(num_classes):
        y_pred = predictions[:, i]
        y_true = true_labels[:, i]

        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics_list.append({
            'model': mod_name,
            'lan': lang,
            'comparison': comp,
            'cat': categories[i],
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    temp_scores = pd.DataFrame([m for m in metrics_list if m['model'] == mod_name])
    scores = pd.concat([scores, temp_scores], ignore_index=True)

    avg_f1 = temp_scores['f1'].mean()
    avg_precision = temp_scores['precision'].mean()
    avg_recall = temp_scores['recall'].mean()

    return {'f1': avg_f1, 'precision': avg_precision, 'recall': avg_recall}

In [10]:
def measure_runtime_and_flops(trainer, dataset):
    flops = 0
    runtime = 0

    for i in range(10):
        for batch in dataset:
            inputs = {key: val.unsqueeze(0).to(trainer.model.device) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
            with torch.profiler.profile(with_flops=True, activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]) as prof:
                start_time = time.time()
                _ = trainer.model(**inputs)
                end_time = time.time()

            runtime += (end_time - start_time)
            flops += sum(k.flops for k in prof.key_averages()) / 1e9 # Convert Flops to GFLOPs

    avg_runtime = runtime / 10
    avg_flops = flops / 10
    
    return avg_runtime, avg_flops, flops, runtime

In [None]:
def compute_metrics_wrapper(mod_name, comp):
    def inner(eval_pred):
        return compute_metrics(eval_pred, lang, labels[lang], mod_name, comp)
    return inner

In [12]:
max_avg_runtime = 5
max_avg_flops = 5000

def score(avg_f1, avg_runtime, avg_flops):
    return (0.6 * avg_f1 +
            0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
            0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

In [13]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for CUDA devices
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable to ensure deterministic operations

In [None]:
# Training the models for each language and saving the the models and thier
# tokenizers after training.

# trying different pre-trained models such as roberta-base, distilbert-base, or codebert-large

seed = 27
set_seed(seed)

learning_rates = [5e-4, 5e-5, 5e-6]
epochs = [5, 10, 15, 20]
batch_sizes = [4, 8, 16]
weight_of_decay = [0.01, 0.001]
gs_metrics_dict = {}
final_model_stats = {}

for lr in learning_rates:
  for wd in weight_of_decay:
    for epoch in epochs:
      for bs in batch_sizes:
        print(f'------------------ Starting model ==> epochs: {epoch}, batch size: {bs}, weights of decay: {wd} ---------------------')
        fin_mod_name = f'epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}'
        total_flops = 0
        total_time = 0
        total_avg_runtime = 0
        total_avg_flops = 0

        for lang in langs:

          lang_mod_name = f'{lang}_epoch-{epoch}_batchSize-{bs}_weightsOfDecay-{wd}'
          wandb.init(
              project="NBSE2025_GridSearch_3", 
              entity="ayoungren-colostate",
              name=lang_mod_name,
              config={  # Log hyperparameters for each run
                  "epochs": epoch,
                  "batch_size": bs,
                  "weight_decay": wd,
                  "language": lang
              }
          )
          
          num_labels = len(labels[lang])
          model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

          dataset = preprocess_dataset(ds[f'{lang}_train'])
          train_valalidation_split = dataset.train_test_split(test_size=0.2, seed=seed)

          train_dataset =  train_valalidation_split['train']
          validation_dataset = train_valalidation_split['test']

          training_args = TrainingArguments(
            output_dir=f'./results_{lang_mod_name}',
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f'./logs_{lang_mod_name}',
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epoch,
            weight_decay=wd,
            learning_rate=lr,
            logging_steps=1000,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            seed=seed
          )
          
          trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset= validation_dataset,
            processing_class=tokenizer,
            compute_metrics=compute_metrics_wrapper(fin_mod_name, 'validation'),
          )

          trainer.train()
          metrics = trainer.evaluate()

          print("Evaluation Metrics:", metrics)

          avg_runtime, avg_flops, total_flops, total_time = measure_runtime_and_flops(trainer, validation_dataset)

            # store each model and succession score by individual language
          lang_mod_scores = scores[(scores['lan'] == lang) & (scores['model'] == fin_mod_name)]
          print('CHECK SCORE', scores[(scores['lan'] == lang) & (scores['model'] == fin_mod_name)])
          lang_avg_f1 = lang_mod_scores['f1'].mean()

          sc = round(score(lang_avg_f1, avg_runtime, avg_flops), 2)
          gs_metrics_dict[lang_mod_name] = {
            'lang': lang,
            'model': fin_mod_name,
            'avg_runtime': avg_runtime,
            'avg_flops': avg_flops,
            'total_flops': total_flops,
            'total_time': total_time,
            'avg_f1': lang_avg_f1,
            'succ_score': sc
          }

          total_avg_runtime += avg_runtime
          total_avg_flops += avg_flops

          trainer.model.save_pretrained(f'./models/{lang_mod_name}')
          tokenizer.save_pretrained(f'./tokenizers/{lang_mod_name}')
          
        print(scores)
        # store combined average succession score of all languages trained on. (Average Combined score of Java, Python, and Pharo)
        model_stats = scores[scores['model'] == fin_mod_name]
        fin_avg_f1 = model_stats['f1'].mean()
        final_model_stats[fin_mod_name] = round(score(fin_avg_f1, total_avg_runtime, total_avg_flops), 2)

------------------ Starting model ==> epochs: 5, batch size: 4, weights of decay: 0.01 ---------------------


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 18059.01 examples/s]
 13%|█▎        | 1003/7615 [01:04<06:56, 15.89it/s]

{'loss': 0.161, 'grad_norm': 0.2385871857404709, 'learning_rate': 4.343401181877873e-05, 'epoch': 0.66}


  scores = pd.concat([scores, temp_scores], ignore_index=True)
                                                   
 20%|██        | 1523/7615 [01:42<06:15, 16.21it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.14922671020030975, 'eval_f1': 0.5111289893021813, 'eval_precision': 0.5001785203458973, 'eval_recall': 0.5258611714970002, 'eval_runtime': 5.5167, 'eval_samples_per_second': 276.073, 'eval_steps_per_second': 69.064, 'epoch': 1.0}


 26%|██▋       | 2001/7615 [02:14<06:29, 14.43it/s]  

{'loss': 0.1085, 'grad_norm': 3.469529628753662, 'learning_rate': 3.6868023637557454e-05, 'epoch': 1.31}


 39%|███▉      | 3003/7615 [03:18<04:41, 16.40it/s]

{'loss': 0.0902, 'grad_norm': 0.49206215143203735, 'learning_rate': 3.030203545633618e-05, 'epoch': 1.97}


                                                   
 40%|████      | 3046/7615 [03:27<04:52, 15.61it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10075070708990097, 'eval_f1': 0.7719487782969091, 'eval_precision': 0.7931749220157622, 'eval_recall': 0.7558382139560148, 'eval_runtime': 5.6377, 'eval_samples_per_second': 270.144, 'eval_steps_per_second': 67.58, 'epoch': 2.0}


 53%|█████▎    | 4003/7615 [04:29<04:00, 14.99it/s]  

{'loss': 0.0655, 'grad_norm': 0.02431034855544567, 'learning_rate': 2.3736047275114905e-05, 'epoch': 2.63}


                                                   
 60%|██████    | 4569/7615 [05:12<03:05, 16.40it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08405041694641113, 'eval_f1': 0.8358127250778112, 'eval_precision': 0.870556550891739, 'eval_recall': 0.8068165673784579, 'eval_runtime': 5.7584, 'eval_samples_per_second': 264.482, 'eval_steps_per_second': 66.164, 'epoch': 3.0}


 66%|██████▌   | 5001/7615 [05:41<02:53, 15.04it/s]

{'loss': 0.0556, 'grad_norm': 7.247128963470459, 'learning_rate': 1.717005909389363e-05, 'epoch': 3.28}


 79%|███████▉  | 6003/7615 [06:44<01:44, 15.43it/s]

{'loss': 0.0394, 'grad_norm': 7.164225101470947, 'learning_rate': 1.0604070912672358e-05, 'epoch': 3.94}


                                                   
 80%|████████  | 6092/7615 [06:56<01:43, 14.74it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09550824016332626, 'eval_f1': 0.8653730458823646, 'eval_precision': 0.8902078912379128, 'eval_recall': 0.8455691903550682, 'eval_runtime': 5.7914, 'eval_samples_per_second': 262.976, 'eval_steps_per_second': 65.787, 'epoch': 4.0}


 92%|█████████▏| 7003/7615 [07:56<00:37, 16.21it/s]

{'loss': 0.0224, 'grad_norm': 0.011469465680420399, 'learning_rate': 4.038082731451084e-06, 'epoch': 4.6}


                                                   
100%|██████████| 7615/7615 [08:43<00:00, 15.15it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10385257005691528, 'eval_f1': 0.8631344492283745, 'eval_precision': 0.8665393300177457, 'eval_recall': 0.8604973309201913, 'eval_runtime': 6.5307, 'eval_samples_per_second': 233.206, 'eval_steps_per_second': 58.34, 'epoch': 5.0}


100%|██████████| 7615/7615 [08:44<00:00, 14.51it/s]


{'train_runtime': 524.8676, 'train_samples_per_second': 58.024, 'train_steps_per_second': 14.508, 'train_loss': 0.0734163070147709, 'epoch': 5.0}


100%|██████████| 381/381 [00:05<00:00, 70.60it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.09550824016332626, 'eval_f1': 0.8653730458823646, 'eval_precision': 0.8902078912379128, 'eval_recall': 0.8455691903550682, 'eval_runtime': 5.4204, 'eval_samples_per_second': 280.976, 'eval_steps_per_second': 70.29, 'epoch': 5.0}


0,1
eval/f1,▁▆▇███
eval/loss,█▃▁▂▃▂
eval/precision,▁▆████
eval/recall,▁▆▇███
eval/runtime,▂▂▃▃█▁
eval/samples_per_second,▇▆▆▅▁█
eval/steps_per_second,▇▆▆▅▁█
train/epoch,▁▂▂▃▃▄▅▅▆▆▇███
train/global_step,▁▂▂▃▃▄▅▅▆▆▇███
train/grad_norm,▁▄▁▁██▁

0,1
eval/f1,0.86537
eval/loss,0.09551
eval/precision,0.89021
eval/recall,0.84557
eval/runtime,5.4204
eval/samples_per_second,280.976
eval/steps_per_second,70.29
total_flos,2003351730144000.0
train/epoch,5.0
train/global_step,7615.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 14272.37 examples/s]
 20%|██        | 377/1885 [00:23<01:33, 16.06it/s]
 20%|██        | 377/1885 [00:25<01:33, 16.06it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.32002392411231995, 'eval_f1': 0.45487126357392266, 'eval_precision': 0.5256509411995671, 'eval_recall': 0.40610616034028235, 'eval_runtime': 1.5693, 'eval_samples_per_second': 240.236, 'eval_steps_per_second': 60.537, 'epoch': 1.0}


 40%|███▉      | 753/1885 [00:50<01:08, 16.64it/s]
 40%|████      | 754/1885 [00:51<01:07, 16.64it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3037950396537781, 'eval_f1': 0.6025853775986256, 'eval_precision': 0.8352263419926216, 'eval_recall': 0.5476087073718239, 'eval_runtime': 1.4298, 'eval_samples_per_second': 263.666, 'eval_steps_per_second': 66.441, 'epoch': 2.0}


 53%|█████▎    | 1003/1885 [01:09<00:53, 16.55it/s]

{'loss': 0.3197, 'grad_norm': 2.327369213104248, 'learning_rate': 2.347480106100796e-05, 'epoch': 2.65}


 60%|██████    | 1131/1885 [01:17<00:44, 16.93it/s]
 60%|██████    | 1131/1885 [01:18<00:44, 16.93it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31088727712631226, 'eval_f1': 0.6742870672590543, 'eval_precision': 0.7897675342560253, 'eval_recall': 0.6266251580101815, 'eval_runtime': 1.5083, 'eval_samples_per_second': 249.95, 'eval_steps_per_second': 62.985, 'epoch': 3.0}


 80%|███████▉  | 1507/1885 [01:44<00:22, 16.73it/s]
 80%|████████  | 1508/1885 [01:45<00:22, 16.73it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.318920761346817, 'eval_f1': 0.7329016766284735, 'eval_precision': 0.7872615608360055, 'eval_recall': 0.6908167371369223, 'eval_runtime': 1.4027, 'eval_samples_per_second': 268.766, 'eval_steps_per_second': 67.726, 'epoch': 4.0}


100%|██████████| 1885/1885 [02:11<00:00, 15.52it/s]
100%|██████████| 1885/1885 [02:14<00:00, 15.52it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3255635201931, 'eval_f1': 0.7463789212763221, 'eval_precision': 0.7807696266120787, 'eval_recall': 0.7164636120102036, 'eval_runtime': 1.4696, 'eval_samples_per_second': 256.537, 'eval_steps_per_second': 64.645, 'epoch': 5.0}


100%|██████████| 1885/1885 [02:16<00:00, 13.85it/s]


{'train_runtime': 136.1439, 'train_samples_per_second': 55.346, 'train_steps_per_second': 13.846, 'train_loss': 0.22197741197338156, 'epoch': 5.0}


100%|██████████| 95/95 [00:01<00:00, 53.91it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.3255635201931, 'eval_f1': 0.7463789212763221, 'eval_precision': 0.7807696266120787, 'eval_recall': 0.7164636120102036, 'eval_runtime': 1.7939, 'eval_samples_per_second': 210.161, 'eval_steps_per_second': 52.958, 'epoch': 5.0}


0,1
eval/f1,▁▅▆███
eval/loss,▆▁▃▆██
eval/precision,▁█▇▇▇▇
eval/recall,▁▄▆▇██
eval/runtime,▄▁▃▁▂█
eval/samples_per_second,▅▇▆█▇▁
eval/steps_per_second,▅▇▆█▇▁
train/epoch,▁▃▄▅▆███
train/global_step,▁▃▄▅▆███
train/grad_norm,▁

0,1
eval/f1,0.74638
eval/loss,0.32556
eval/precision,0.78077
eval/recall,0.71646
eval/runtime,1.7939
eval/samples_per_second,210.161
eval/steps_per_second,52.958
total_flos,495648800866560.0
train/epoch,5.0
train/global_step,1885.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 10491.80 examples/s]
 20%|█▉        | 259/1300 [00:16<01:07, 15.38it/s]
 20%|██        | 260/1300 [00:17<01:07, 15.38it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.29351019859313965, 'eval_f1': 0.26654862106263005, 'eval_precision': 0.31132295023272466, 'eval_recall': 0.23842179706692473, 'eval_runtime': 1.3661, 'eval_samples_per_second': 190.323, 'eval_steps_per_second': 47.581, 'epoch': 1.0}


 40%|███▉      | 519/1300 [00:35<00:46, 16.65it/s]
 40%|████      | 520/1300 [00:36<00:46, 16.65it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23040412366390228, 'eval_f1': 0.4936457990720073, 'eval_precision': 0.5844664122518417, 'eval_recall': 0.43152060385095575, 'eval_runtime': 0.962, 'eval_samples_per_second': 270.278, 'eval_steps_per_second': 67.569, 'epoch': 2.0}


 60%|█████▉    | 779/1300 [00:55<00:32, 15.85it/s]
 60%|██████    | 780/1300 [00:57<00:32, 15.85it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22708669304847717, 'eval_f1': 0.5198598617533984, 'eval_precision': 0.5720715558950854, 'eval_recall': 0.48063645073742467, 'eval_runtime': 1.1087, 'eval_samples_per_second': 234.507, 'eval_steps_per_second': 58.627, 'epoch': 3.0}


 77%|███████▋  | 1003/1300 [01:13<00:20, 14.77it/s]

{'loss': 0.2175, 'grad_norm': 0.4598521888256073, 'learning_rate': 1.153846153846154e-05, 'epoch': 3.85}


 80%|███████▉  | 1039/1300 [01:15<00:16, 15.38it/s]
 80%|████████  | 1040/1300 [01:16<00:16, 15.38it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22135844826698303, 'eval_f1': 0.5526401694083359, 'eval_precision': 0.5990715527005851, 'eval_recall': 0.5199639488139229, 'eval_runtime': 1.2915, 'eval_samples_per_second': 201.312, 'eval_steps_per_second': 50.328, 'epoch': 4.0}


100%|█████████▉| 1299/1300 [01:35<00:00, 16.21it/s]
100%|██████████| 1300/1300 [01:37<00:00, 16.21it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.21894140541553497, 'eval_f1': 0.5957053673379221, 'eval_precision': 0.746988740682342, 'eval_recall': 0.5475291134936563, 'eval_runtime': 1.12, 'eval_samples_per_second': 232.147, 'eval_steps_per_second': 58.037, 'epoch': 5.0}


100%|██████████| 1300/1300 [01:39<00:00, 13.03it/s]


{'train_runtime': 99.7977, 'train_samples_per_second': 52.005, 'train_steps_per_second': 13.026, 'train_loss': 0.18813525273249698, 'epoch': 5.0}


100%|██████████| 65/65 [00:01<00:00, 50.56it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.21894140541553497, 'eval_f1': 0.5957053673379221, 'eval_precision': 0.746988740682342, 'eval_recall': 0.5475291134936563, 'eval_runtime': 1.3215, 'eval_samples_per_second': 196.745, 'eval_steps_per_second': 49.186, 'epoch': 5.0}
    comparison                                    model     lan  \
0   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
1   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
2   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
3   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
4   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
5   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
6   validation  epoch-5_batchSize-4_weightsOfDecay-0.01    java   
7   validation  epoch-5_batchSize-4_weightsOfDecay-0.01  python   
8   validation  epoch-5_batchSize-4_weightsOfDecay-0.01  python   
9   validation  

0,1
eval/f1,▁▆▆▇██
eval/loss,█▂▂▁▁▁
eval/precision,▁▅▅▆██
eval/recall,▁▅▆▇██
eval/runtime,█▁▄▇▄▇
eval/samples_per_second,▁█▅▂▅▂
eval/steps_per_second,▁█▅▂▅▂
train/epoch,▁▃▅▆▆███
train/global_step,▁▃▅▆▆███
train/grad_norm,▁

0,1
eval/f1,0.59571
eval/loss,0.21894
eval/precision,0.74699
eval/recall,0.54753
eval/runtime,1.3215
eval/samples_per_second,196.745
eval/steps_per_second,49.186
total_flos,341401920192000.0
train/epoch,5.0
train/global_step,1300.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 20508.07 examples/s]
 20%|█▉        | 761/3810 [00:55<03:54, 13.00it/s]
 20%|██        | 762/3810 [00:59<03:54, 13.00it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11081206053495407, 'eval_f1': 0.5607734116765417, 'eval_precision': 0.7469528851693247, 'eval_recall': 0.5481769681642766, 'eval_runtime': 4.0214, 'eval_samples_per_second': 378.727, 'eval_steps_per_second': 47.496, 'epoch': 1.0}


 26%|██▋       | 1001/3810 [01:19<03:37, 12.93it/s]

{'loss': 0.1332, 'grad_norm': 3.2732927799224854, 'learning_rate': 3.6876640419947505e-05, 'epoch': 1.31}


 40%|███▉      | 1523/3810 [01:57<02:51, 13.34it/s]
 40%|████      | 1524/3810 [02:01<02:51, 13.34it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08294270187616348, 'eval_f1': 0.7437767496447479, 'eval_precision': 0.9134550045964017, 'eval_recall': 0.7029050417316478, 'eval_runtime': 3.8472, 'eval_samples_per_second': 395.877, 'eval_steps_per_second': 49.647, 'epoch': 2.0}


 53%|█████▎    | 2001/3810 [02:38<02:20, 12.85it/s]

{'loss': 0.0711, 'grad_norm': 0.028852000832557678, 'learning_rate': 2.3753280839895015e-05, 'epoch': 2.62}


 60%|█████▉    | 2285/3810 [02:59<01:53, 13.42it/s]
 60%|██████    | 2286/3810 [03:04<01:53, 13.42it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.07681600004434586, 'eval_f1': 0.8421422076096936, 'eval_precision': 0.873295042720631, 'eval_recall': 0.8149901800729638, 'eval_runtime': 4.1324, 'eval_samples_per_second': 368.548, 'eval_steps_per_second': 46.22, 'epoch': 3.0}


 79%|███████▉  | 3002/3810 [04:00<01:01, 13.07it/s]

{'loss': 0.0394, 'grad_norm': 0.12436270713806152, 'learning_rate': 1.062992125984252e-05, 'epoch': 3.94}


 80%|████████  | 3048/3810 [04:03<00:54, 14.02it/s]
 80%|████████  | 3048/3810 [04:07<00:54, 14.02it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08419058471918106, 'eval_f1': 0.8689351968548402, 'eval_precision': 0.8899172233829553, 'eval_recall': 0.8520457895570871, 'eval_runtime': 3.9766, 'eval_samples_per_second': 382.992, 'eval_steps_per_second': 48.031, 'epoch': 4.0}


100%|██████████| 3810/3810 [05:06<00:00, 14.05it/s]
100%|██████████| 3810/3810 [05:12<00:00, 14.05it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09168393909931183, 'eval_f1': 0.862942671379306, 'eval_precision': 0.8856982996026572, 'eval_recall': 0.8460739973864218, 'eval_runtime': 4.0422, 'eval_samples_per_second': 376.777, 'eval_steps_per_second': 47.252, 'epoch': 5.0}


100%|██████████| 3810/3810 [05:15<00:00, 12.08it/s]


{'train_runtime': 315.44, 'train_samples_per_second': 96.548, 'train_steps_per_second': 12.078, 'train_loss': 0.06831518308384212, 'epoch': 5.0}


100%|██████████| 191/191 [00:03<00:00, 49.62it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.08419058471918106, 'eval_f1': 0.8689351968548402, 'eval_precision': 0.8899172233829553, 'eval_recall': 0.8520457895570871, 'eval_runtime': 3.9341, 'eval_samples_per_second': 387.123, 'eval_steps_per_second': 48.549, 'epoch': 5.0}


0,1
eval/f1,▁▅▇███
eval/loss,█▂▁▃▄▃
eval/precision,▁█▆▇▇▇
eval/recall,▁▅▇███
eval/runtime,▅▁█▄▆▃
eval/samples_per_second,▄█▁▅▃▆
eval/steps_per_second,▄█▁▅▃▆
train/epoch,▁▂▃▄▅▆▆███
train/global_step,▁▂▃▄▅▆▆███
train/grad_norm,█▁▁

0,1
eval/f1,0.86894
eval/loss,0.08419
eval/precision,0.88992
eval/recall,0.85205
eval/runtime,3.9341
eval/samples_per_second,387.123
eval/steps_per_second,48.549
total_flos,2003351730144000.0
train/epoch,5.0
train/global_step,3810.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 14717.38 examples/s]
 20%|██        | 189/945 [00:13<00:52, 14.52it/s]
 20%|██        | 189/945 [00:14<00:52, 14.52it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.336627721786499, 'eval_f1': 0.4138625470390177, 'eval_precision': 0.5646315789473684, 'eval_recall': 0.3380812213937992, 'eval_runtime': 0.9391, 'eval_samples_per_second': 401.468, 'eval_steps_per_second': 51.115, 'epoch': 1.0}


 40%|███▉      | 377/945 [00:30<00:40, 13.95it/s]
 40%|████      | 378/945 [00:31<00:40, 13.95it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3423939645290375, 'eval_f1': 0.5389728817653051, 'eval_precision': 0.6113871930538597, 'eval_recall': 0.5080514728201393, 'eval_runtime': 0.9475, 'eval_samples_per_second': 397.894, 'eval_steps_per_second': 50.66, 'epoch': 2.0}


 60%|██████    | 567/945 [00:46<00:27, 13.64it/s]
 60%|██████    | 567/945 [00:47<00:27, 13.64it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.2830806076526642, 'eval_f1': 0.5901947965055152, 'eval_precision': 0.662228604999268, 'eval_recall': 0.5474470383155207, 'eval_runtime': 1.0265, 'eval_samples_per_second': 367.282, 'eval_steps_per_second': 46.763, 'epoch': 3.0}


 80%|███████▉  | 755/945 [01:05<00:13, 13.92it/s]
 80%|████████  | 756/945 [01:06<00:13, 13.92it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.2725023329257965, 'eval_f1': 0.7193657729976148, 'eval_precision': 0.7992387087687133, 'eval_recall': 0.6633549959344535, 'eval_runtime': 0.951, 'eval_samples_per_second': 396.434, 'eval_steps_per_second': 50.474, 'epoch': 4.0}


100%|██████████| 945/945 [01:22<00:00, 14.45it/s]
100%|██████████| 945/945 [01:25<00:00, 14.45it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.2746121883392334, 'eval_f1': 0.7404721244540559, 'eval_precision': 0.7977589343280833, 'eval_recall': 0.6931875356130865, 'eval_runtime': 0.9575, 'eval_samples_per_second': 393.75, 'eval_steps_per_second': 50.133, 'epoch': 5.0}


100%|██████████| 945/945 [01:27<00:00, 10.84it/s]


{'train_runtime': 87.1492, 'train_samples_per_second': 86.461, 'train_steps_per_second': 10.843, 'train_loss': 0.245153566390749, 'epoch': 5.0}


100%|██████████| 48/48 [00:01<00:00, 43.91it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.2746121883392334, 'eval_f1': 0.7404721244540559, 'eval_precision': 0.7977589343280833, 'eval_recall': 0.6931875356130865, 'eval_runtime': 1.1447, 'eval_samples_per_second': 329.337, 'eval_steps_per_second': 41.931, 'epoch': 5.0}


0,1
eval/f1,▁▄▅███
eval/loss,▇█▂▁▁▁
eval/precision,▁▂▄███
eval/recall,▁▄▅▇██
eval/runtime,▁▁▄▁▂█
eval/samples_per_second,██▅█▇▁
eval/steps_per_second,██▅█▇▁
train/epoch,▁▃▅▆███
train/global_step,▁▃▅▆███

0,1
eval/f1,0.74047
eval/loss,0.27461
eval/precision,0.79776
eval/recall,0.69319
eval/runtime,1.1447
eval/samples_per_second,329.337
eval/steps_per_second,41.931
total_flos,495648800866560.0
train/epoch,5.0
train/global_step,945.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 12420.48 examples/s]
 20%|█▉        | 129/650 [00:09<00:37, 13.76it/s]
 20%|██        | 130/650 [00:10<00:37, 13.76it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3101281523704529, 'eval_f1': 0.13944590852541036, 'eval_precision': 0.2097222222222222, 'eval_recall': 0.11753889674681753, 'eval_runtime': 0.6633, 'eval_samples_per_second': 392.002, 'eval_steps_per_second': 49.754, 'epoch': 1.0}


 40%|███▉      | 259/650 [00:21<00:27, 13.99it/s]
 40%|████      | 260/650 [00:22<00:27, 13.99it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2588973045349121, 'eval_f1': 0.32031767252355486, 'eval_precision': 0.4774477232753567, 'eval_recall': 0.2545936887721727, 'eval_runtime': 0.6452, 'eval_samples_per_second': 402.971, 'eval_steps_per_second': 51.146, 'epoch': 2.0}


 60%|█████▉    | 389/650 [00:33<00:19, 13.09it/s]
 60%|██████    | 390/650 [00:34<00:19, 13.09it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2509313225746155, 'eval_f1': 0.49315523796572475, 'eval_precision': 0.5811926296071441, 'eval_recall': 0.44304135100760117, 'eval_runtime': 0.7183, 'eval_samples_per_second': 361.973, 'eval_steps_per_second': 45.943, 'epoch': 3.0}


 80%|████████  | 520/650 [00:45<00:09, 14.06it/s]
 80%|████████  | 520/650 [00:46<00:09, 14.06it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2351602464914322, 'eval_f1': 0.523201398348527, 'eval_precision': 0.5768249473405219, 'eval_recall': 0.4829552167457073, 'eval_runtime': 0.658, 'eval_samples_per_second': 395.107, 'eval_steps_per_second': 50.148, 'epoch': 4.0}


100%|██████████| 650/650 [00:57<00:00, 13.65it/s]
100%|██████████| 650/650 [01:00<00:00, 13.65it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23413816094398499, 'eval_f1': 0.5243569712509989, 'eval_precision': 0.5810479175996417, 'eval_recall': 0.48159355776060275, 'eval_runtime': 0.7218, 'eval_samples_per_second': 360.19, 'eval_steps_per_second': 45.716, 'epoch': 5.0}


100%|██████████| 650/650 [01:02<00:00, 10.45it/s]


{'train_runtime': 62.2163, 'train_samples_per_second': 83.419, 'train_steps_per_second': 10.447, 'train_loss': 0.21150705190805288, 'epoch': 5.0}


100%|██████████| 33/33 [00:00<00:00, 43.51it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.23413816094398499, 'eval_f1': 0.5243569712509989, 'eval_precision': 0.5810479175996417, 'eval_recall': 0.48159355776060275, 'eval_runtime': 0.8112, 'eval_samples_per_second': 320.529, 'eval_steps_per_second': 40.683, 'epoch': 5.0}
    comparison                                    model     lan  \
0   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
1   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
2   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
3   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
4   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
5   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
6   validation  epoch-5_batchSize-8_weightsOfDecay-0.01    java   
7   validation  epoch-5_batchSize-8_weightsOfDecay-0.01  python   
8   validation  epoch-5_batchSize-8_weightsOfDecay-0.01  python   
9   validation

0,1
eval/f1,▁▄▇███
eval/loss,█▃▃▁▁▁
eval/precision,▁▆████
eval/recall,▁▄▇███
eval/runtime,▂▁▄▂▄█
eval/samples_per_second,▇█▅▇▄▁
eval/steps_per_second,▇█▅▇▄▁
train/epoch,▁▃▅▆███
train/global_step,▁▃▅▆███

0,1
eval/f1,0.52436
eval/loss,0.23414
eval/precision,0.58105
eval/recall,0.48159
eval/runtime,0.8112
eval/samples_per_second,320.529
eval/steps_per_second,40.683
total_flos,341401920192000.0
train/epoch,5.0
train/global_step,650.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 18504.42 examples/s]
  7%|▋         | 1003/15230 [01:02<14:34, 16.27it/s]

{'loss': 0.1597, 'grad_norm': 0.2742033898830414, 'learning_rate': 4.6717005909389365e-05, 'epoch': 0.66}


 10%|█         | 1523/15230 [01:35<13:54, 16.43it/s]
 10%|█         | 1523/15230 [01:41<13:54, 16.43it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11007532477378845, 'eval_f1': 0.5902030753461814, 'eval_precision': 0.6884361645520868, 'eval_recall': 0.5609637517777433, 'eval_runtime': 5.808, 'eval_samples_per_second': 262.222, 'eval_steps_per_second': 65.599, 'epoch': 1.0}


 13%|█▎        | 2003/15230 [02:13<14:02, 15.71it/s]  

{'loss': 0.1072, 'grad_norm': 2.4805963039398193, 'learning_rate': 4.343401181877873e-05, 'epoch': 1.31}


 20%|█▉        | 3003/15230 [03:16<14:02, 14.51it/s]

{'loss': 0.0958, 'grad_norm': 0.9716637134552002, 'learning_rate': 4.015101772816809e-05, 'epoch': 1.97}


 20%|█▉        | 3045/15230 [03:19<12:19, 16.48it/s]
 20%|██        | 3046/15230 [03:25<12:19, 16.48it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09138189256191254, 'eval_f1': 0.7760446765846533, 'eval_precision': 0.845572110141781, 'eval_recall': 0.7414169026576003, 'eval_runtime': 6.1284, 'eval_samples_per_second': 248.515, 'eval_steps_per_second': 62.17, 'epoch': 2.0}


 26%|██▋       | 4003/15230 [04:26<12:00, 15.58it/s]  

{'loss': 0.0716, 'grad_norm': 0.0505782812833786, 'learning_rate': 3.6868023637557454e-05, 'epoch': 2.63}


 30%|███       | 4569/15230 [05:02<12:00, 14.80it/s]
 30%|███       | 4569/15230 [05:09<12:00, 14.80it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08530933409929276, 'eval_f1': 0.843537102775221, 'eval_precision': 0.8907991784724242, 'eval_recall': 0.8095283586409099, 'eval_runtime': 7.1172, 'eval_samples_per_second': 213.987, 'eval_steps_per_second': 53.532, 'epoch': 3.0}


 33%|███▎      | 5003/15230 [05:38<10:33, 16.13it/s]  

{'loss': 0.0599, 'grad_norm': 0.38465386629104614, 'learning_rate': 3.3585029546946817e-05, 'epoch': 3.28}


 39%|███▉      | 6003/15230 [06:41<09:30, 16.17it/s]

{'loss': 0.0484, 'grad_norm': 7.433000564575195, 'learning_rate': 3.030203545633618e-05, 'epoch': 3.94}


 40%|███▉      | 6091/15230 [06:47<09:10, 16.60it/s]
 40%|████      | 6092/15230 [06:53<09:10, 16.60it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10216836631298065, 'eval_f1': 0.8551550068428547, 'eval_precision': 0.8908242771699035, 'eval_recall': 0.8277622476207174, 'eval_runtime': 5.8198, 'eval_samples_per_second': 261.693, 'eval_steps_per_second': 65.466, 'epoch': 4.0}


 46%|████▌     | 7003/15230 [07:52<09:11, 14.92it/s]  

{'loss': 0.0336, 'grad_norm': 0.006706401705741882, 'learning_rate': 2.7019041365725546e-05, 'epoch': 4.6}


 50%|█████     | 7615/15230 [08:30<07:51, 16.14it/s]
 50%|█████     | 7615/15230 [08:36<07:51, 16.14it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10847470164299011, 'eval_f1': 0.853809714455252, 'eval_precision': 0.8696048520026923, 'eval_recall': 0.8398567662302445, 'eval_runtime': 6.0865, 'eval_samples_per_second': 250.228, 'eval_steps_per_second': 62.598, 'epoch': 5.0}


 53%|█████▎    | 8003/15230 [09:02<07:36, 15.84it/s]  

{'loss': 0.032, 'grad_norm': 0.009480156004428864, 'learning_rate': 2.3736047275114905e-05, 'epoch': 5.25}


 59%|█████▉    | 9003/15230 [10:05<06:10, 16.83it/s]

{'loss': 0.0216, 'grad_norm': 7.809123992919922, 'learning_rate': 2.045305318450427e-05, 'epoch': 5.91}


 60%|█████▉    | 9137/15230 [10:14<06:46, 15.00it/s]
 60%|██████    | 9138/15230 [10:21<06:46, 15.00it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12644711136817932, 'eval_f1': 0.8387365796811126, 'eval_precision': 0.8550365913091653, 'eval_recall': 0.8268029686294016, 'eval_runtime': 6.7429, 'eval_samples_per_second': 225.866, 'eval_steps_per_second': 56.504, 'epoch': 6.0}


 66%|██████▌   | 10003/15230 [11:17<05:22, 16.22it/s] 

{'loss': 0.0171, 'grad_norm': 0.020755643025040627, 'learning_rate': 1.717005909389363e-05, 'epoch': 6.57}


 70%|███████   | 10661/15230 [11:58<04:27, 17.10it/s]
 70%|███████   | 10661/15230 [12:04<04:27, 17.10it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12016332149505615, 'eval_f1': 0.8511363947106395, 'eval_precision': 0.8610251266045083, 'eval_recall': 0.8449218385548981, 'eval_runtime': 6.7962, 'eval_samples_per_second': 224.097, 'eval_steps_per_second': 56.061, 'epoch': 7.0}


 72%|███████▏  | 11003/15230 [12:27<04:34, 15.40it/s]  

{'loss': 0.0172, 'grad_norm': 0.04411223903298378, 'learning_rate': 1.3887065003282995e-05, 'epoch': 7.22}


 79%|███████▉  | 12003/15230 [13:30<03:25, 15.69it/s]

{'loss': 0.013, 'grad_norm': 0.004007680341601372, 'learning_rate': 1.0604070912672358e-05, 'epoch': 7.88}


 80%|███████▉  | 12183/15230 [13:41<03:10, 15.97it/s]
 80%|████████  | 12184/15230 [13:46<03:10, 15.97it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13028813898563385, 'eval_f1': 0.8506719455762797, 'eval_precision': 0.8677081412415405, 'eval_recall': 0.839180506418935, 'eval_runtime': 5.5534, 'eval_samples_per_second': 274.248, 'eval_steps_per_second': 68.607, 'epoch': 8.0}


 85%|████████▌ | 13003/15230 [14:39<02:18, 16.12it/s]

{'loss': 0.0053, 'grad_norm': 0.03511672466993332, 'learning_rate': 7.321076822061721e-06, 'epoch': 8.54}


 90%|█████████ | 13707/15230 [15:23<01:38, 15.46it/s]
 90%|█████████ | 13707/15230 [15:30<01:38, 15.46it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1259148120880127, 'eval_f1': 0.8660405527320924, 'eval_precision': 0.8720072148832249, 'eval_recall': 0.8627853205589106, 'eval_runtime': 6.2508, 'eval_samples_per_second': 243.65, 'eval_steps_per_second': 60.952, 'epoch': 9.0}


 92%|█████████▏| 14003/15230 [15:50<01:18, 15.72it/s]

{'loss': 0.0069, 'grad_norm': 0.013757200911641121, 'learning_rate': 4.038082731451084e-06, 'epoch': 9.19}


 99%|█████████▊| 15003/15230 [16:53<00:14, 15.37it/s]

{'loss': 0.0069, 'grad_norm': 0.001989788142964244, 'learning_rate': 7.550886408404465e-07, 'epoch': 9.85}


100%|█████████▉| 15229/15230 [17:07<00:00, 16.00it/s]
100%|██████████| 15230/15230 [17:16<00:00, 16.00it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12557508051395416, 'eval_f1': 0.8660274088205008, 'eval_precision': 0.8771314301053222, 'eval_recall': 0.8587786563557191, 'eval_runtime': 7.5793, 'eval_samples_per_second': 200.941, 'eval_steps_per_second': 50.268, 'epoch': 10.0}


100%|██████████| 15230/15230 [17:18<00:00, 14.66it/s]


{'train_runtime': 1038.6113, 'train_samples_per_second': 58.646, 'train_steps_per_second': 14.664, 'train_loss': 0.045770075837014465, 'epoch': 10.0}


100%|██████████| 381/381 [00:05<00:00, 65.21it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.1259148120880127, 'eval_f1': 0.8660405527320924, 'eval_precision': 0.8720072148832249, 'eval_recall': 0.8627853205589106, 'eval_runtime': 5.8756, 'eval_samples_per_second': 259.206, 'eval_steps_per_second': 64.844, 'epoch': 10.0}


0,1
eval/f1,▁▆▇██▇█████
eval/loss,▅▂▁▄▅▇▆█▇▇▇
eval/precision,▁▆██▇▇▇▇▇█▇
eval/recall,▁▅▇▇▇▇█▇███
eval/runtime,▂▃▆▂▃▅▅▁▃█▂
eval/samples_per_second,▇▆▂▇▆▃▃█▅▁▇
eval/steps_per_second,▇▆▂▇▆▃▃█▅▁▇
train/epoch,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇████
train/grad_norm,▁▃▂▁▁█▁▁█▁▁▁▁▁▁

0,1
eval/f1,0.86604
eval/loss,0.12591
eval/precision,0.87201
eval/recall,0.86279
eval/runtime,5.8756
eval/samples_per_second,259.206
eval/steps_per_second,64.844
total_flos,4006703460288000.0
train/epoch,10.0
train/global_step,15230.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 11533.37 examples/s]
 10%|█         | 377/3770 [00:23<03:35, 15.74it/s]
 10%|█         | 377/3770 [00:25<03:35, 15.74it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.35528668761253357, 'eval_f1': 0.33983840836913887, 'eval_precision': 0.5497777777777777, 'eval_recall': 0.2917577752260193, 'eval_runtime': 1.771, 'eval_samples_per_second': 212.872, 'eval_steps_per_second': 53.641, 'epoch': 1.0}


 20%|█▉        | 753/3770 [00:49<03:17, 15.30it/s]
 20%|██        | 754/3770 [00:51<03:17, 15.30it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.30476072430610657, 'eval_f1': 0.5383386505967152, 'eval_precision': 0.6678908058698999, 'eval_recall': 0.48363171269324823, 'eval_runtime': 1.9322, 'eval_samples_per_second': 195.111, 'eval_steps_per_second': 49.166, 'epoch': 2.0}


 27%|██▋       | 1003/3770 [01:09<02:57, 15.59it/s]

{'loss': 0.3393, 'grad_norm': 55.69825744628906, 'learning_rate': 3.673740053050398e-05, 'epoch': 2.65}


 30%|███       | 1131/3770 [01:17<02:58, 14.81it/s]
 30%|███       | 1131/3770 [01:19<02:58, 14.81it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31309589743614197, 'eval_f1': 0.6334249017076683, 'eval_precision': 0.7929999999999999, 'eval_recall': 0.590716382945659, 'eval_runtime': 1.6138, 'eval_samples_per_second': 233.606, 'eval_steps_per_second': 58.866, 'epoch': 3.0}


 40%|███▉      | 1507/3770 [01:44<02:26, 15.47it/s]
 40%|████      | 1508/3770 [01:46<02:26, 15.47it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.32463932037353516, 'eval_f1': 0.6978712601865676, 'eval_precision': 0.7540954556994162, 'eval_recall': 0.6575109520129554, 'eval_runtime': 1.8189, 'eval_samples_per_second': 207.272, 'eval_steps_per_second': 52.23, 'epoch': 4.0}


 50%|█████     | 1885/3770 [02:12<01:48, 17.37it/s]
 50%|█████     | 1885/3770 [02:13<01:48, 17.37it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.35005122423171997, 'eval_f1': 0.7260646210665089, 'eval_precision': 0.7680608280608281, 'eval_recall': 0.6918072268753139, 'eval_runtime': 1.8251, 'eval_samples_per_second': 206.563, 'eval_steps_per_second': 52.052, 'epoch': 5.0}


 53%|█████▎    | 2003/3770 [02:23<01:46, 16.64it/s]

{'loss': 0.144, 'grad_norm': 0.06204480677843094, 'learning_rate': 2.347480106100796e-05, 'epoch': 5.31}


 60%|█████▉    | 2261/3770 [02:39<01:36, 15.71it/s]
 60%|██████    | 2262/3770 [02:41<01:36, 15.71it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3824194073677063, 'eval_f1': 0.7349763145365902, 'eval_precision': 0.7719577860427508, 'eval_recall': 0.7234482866926422, 'eval_runtime': 1.7156, 'eval_samples_per_second': 219.752, 'eval_steps_per_second': 55.375, 'epoch': 6.0}


 70%|███████   | 2639/3770 [03:05<01:08, 16.57it/s]
 70%|███████   | 2639/3770 [03:07<01:08, 16.57it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4119740426540375, 'eval_f1': 0.7542330017942938, 'eval_precision': 0.7738324862992794, 'eval_recall': 0.7394381716970921, 'eval_runtime': 1.5131, 'eval_samples_per_second': 249.158, 'eval_steps_per_second': 62.785, 'epoch': 7.0}


 80%|███████▉  | 3003/3770 [03:33<00:45, 16.70it/s]

{'loss': 0.0478, 'grad_norm': 0.10605884343385696, 'learning_rate': 1.0212201591511936e-05, 'epoch': 7.96}


 80%|███████▉  | 3015/3770 [03:33<00:44, 17.13it/s]
 80%|████████  | 3016/3770 [03:35<00:44, 17.13it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.41452154517173767, 'eval_f1': 0.7539386562458843, 'eval_precision': 0.7646518759018759, 'eval_recall': 0.7613933087595711, 'eval_runtime': 1.4069, 'eval_samples_per_second': 267.956, 'eval_steps_per_second': 67.522, 'epoch': 8.0}


 90%|█████████ | 3393/3770 [04:00<00:23, 16.11it/s]
 90%|█████████ | 3393/3770 [04:01<00:23, 16.11it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4322541356086731, 'eval_f1': 0.758241179763316, 'eval_precision': 0.7698073658943224, 'eval_recall': 0.7491639301283841, 'eval_runtime': 1.3539, 'eval_samples_per_second': 278.455, 'eval_steps_per_second': 70.168, 'epoch': 9.0}


100%|█████████▉| 3769/3770 [04:26<00:00, 15.86it/s]
100%|██████████| 3770/3770 [04:29<00:00, 15.86it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4300021529197693, 'eval_f1': 0.7572520233389797, 'eval_precision': 0.7632191425473586, 'eval_recall': 0.7586171527860304, 'eval_runtime': 1.3828, 'eval_samples_per_second': 272.629, 'eval_steps_per_second': 68.7, 'epoch': 10.0}


100%|██████████| 3770/3770 [04:31<00:00, 13.87it/s]


{'train_runtime': 271.8951, 'train_samples_per_second': 55.426, 'train_steps_per_second': 13.866, 'train_loss': 0.14441664086096484, 'epoch': 10.0}


100%|██████████| 95/95 [00:01<00:00, 55.22it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.4322541356086731, 'eval_f1': 0.758241179763316, 'eval_precision': 0.7698073658943224, 'eval_recall': 0.7491639301283841, 'eval_runtime': 1.754, 'eval_samples_per_second': 214.932, 'eval_steps_per_second': 54.161, 'epoch': 10.0}


0,1
eval/f1,▁▄▆▇▇██████
eval/loss,▄▁▁▂▃▅▇▇███
eval/precision,▁▄█▇▇▇▇▇▇▇▇
eval/recall,▁▄▅▆▇▇█████
eval/runtime,▆█▄▇▇▅▃▂▁▁▆
eval/samples_per_second,▂▁▄▂▂▃▆▇██▃
eval/steps_per_second,▂▁▄▂▂▃▆▇██▃
train/epoch,▁▂▂▃▃▄▄▅▆▆▆▇███
train/global_step,▁▂▂▃▃▄▄▅▆▆▆▇███
train/grad_norm,█▁▁

0,1
eval/f1,0.75824
eval/loss,0.43225
eval/precision,0.76981
eval/recall,0.74916
eval/runtime,1.754
eval/samples_per_second,214.932
eval/steps_per_second,54.161
total_flos,991297601733120.0
train/epoch,10.0
train/global_step,3770.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 12487.89 examples/s]
 10%|▉         | 259/2600 [00:16<02:38, 14.80it/s]
 10%|█         | 260/2600 [00:18<02:38, 14.80it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2928790748119354, 'eval_f1': 0.2463874453557578, 'eval_precision': 0.3160183226844726, 'eval_recall': 0.21270751135263904, 'eval_runtime': 1.3328, 'eval_samples_per_second': 195.079, 'eval_steps_per_second': 48.77, 'epoch': 1.0}


 20%|█▉        | 519/2600 [00:35<02:01, 17.08it/s]
 20%|██        | 520/2600 [00:36<02:01, 17.08it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.25544461607933044, 'eval_f1': 0.4587764658534188, 'eval_precision': 0.5758048780902708, 'eval_recall': 0.39113402543947523, 'eval_runtime': 0.9781, 'eval_samples_per_second': 265.831, 'eval_steps_per_second': 66.458, 'epoch': 2.0}


 30%|██▉       | 779/2600 [00:55<01:58, 15.36it/s]
 30%|███       | 780/2600 [00:56<01:58, 15.36it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24882471561431885, 'eval_f1': 0.4837603235419391, 'eval_precision': 0.5756863366477086, 'eval_recall': 0.4300580802875406, 'eval_runtime': 1.0427, 'eval_samples_per_second': 249.358, 'eval_steps_per_second': 62.339, 'epoch': 3.0}


 39%|███▊      | 1003/2600 [01:12<01:42, 15.55it/s]

{'loss': 0.2238, 'grad_norm': 0.2532604932785034, 'learning_rate': 3.0769230769230774e-05, 'epoch': 3.85}


 40%|███▉      | 1039/2600 [01:14<01:34, 16.45it/s]
 40%|████      | 1040/2600 [01:15<01:34, 16.45it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24002902209758759, 'eval_f1': 0.5212594059023458, 'eval_precision': 0.5537376182357001, 'eval_recall': 0.494401512825369, 'eval_runtime': 1.1106, 'eval_samples_per_second': 234.111, 'eval_steps_per_second': 58.528, 'epoch': 4.0}


 50%|████▉     | 1299/2600 [01:33<01:18, 16.61it/s]
 50%|█████     | 1300/2600 [01:34<01:18, 16.61it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24601681530475616, 'eval_f1': 0.535117353850768, 'eval_precision': 0.5695893904333802, 'eval_recall': 0.5053059263464569, 'eval_runtime': 1.2429, 'eval_samples_per_second': 209.195, 'eval_steps_per_second': 52.299, 'epoch': 5.0}


 60%|█████▉    | 1559/2600 [01:52<01:01, 16.80it/s]
 60%|██████    | 1560/2600 [01:53<01:01, 16.80it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2501889765262604, 'eval_f1': 0.6400330870081182, 'eval_precision': 0.8491383614342973, 'eval_recall': 0.5749261262940976, 'eval_runtime': 1.0027, 'eval_samples_per_second': 259.298, 'eval_steps_per_second': 64.825, 'epoch': 6.0}


 70%|██████▉   | 1819/2600 [02:11<00:49, 15.92it/s]
 70%|███████   | 1820/2600 [02:13<00:48, 15.92it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24531517922878265, 'eval_f1': 0.6517919905824169, 'eval_precision': 0.834844723093571, 'eval_recall': 0.5790832780286507, 'eval_runtime': 1.2022, 'eval_samples_per_second': 216.264, 'eval_steps_per_second': 54.066, 'epoch': 7.0}


 77%|███████▋  | 2003/2600 [02:27<00:38, 15.32it/s]

{'loss': 0.0767, 'grad_norm': 0.04852132871747017, 'learning_rate': 1.153846153846154e-05, 'epoch': 7.69}


 80%|███████▉  | 2079/2600 [02:31<00:33, 15.52it/s]
 80%|████████  | 2080/2600 [02:33<00:33, 15.52it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2502407431602478, 'eval_f1': 0.6441950321639485, 'eval_precision': 0.7466123974093735, 'eval_recall': 0.5921727910615867, 'eval_runtime': 1.2612, 'eval_samples_per_second': 206.157, 'eval_steps_per_second': 51.539, 'epoch': 8.0}


 90%|████████▉ | 2339/2600 [02:51<00:16, 15.65it/s]
 90%|█████████ | 2340/2600 [02:52<00:16, 15.65it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2516332268714905, 'eval_f1': 0.6323829358172329, 'eval_precision': 0.7996249098642715, 'eval_recall': 0.5857961351360584, 'eval_runtime': 0.9605, 'eval_samples_per_second': 270.682, 'eval_steps_per_second': 67.671, 'epoch': 9.0}


100%|█████████▉| 2599/2600 [03:10<00:00, 15.69it/s]
100%|██████████| 2600/2600 [03:13<00:00, 15.69it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2514691650867462, 'eval_f1': 0.6601022153729527, 'eval_precision': 0.7437475878416789, 'eval_recall': 0.6096841474718123, 'eval_runtime': 1.1342, 'eval_samples_per_second': 229.243, 'eval_steps_per_second': 57.311, 'epoch': 10.0}


100%|██████████| 2600/2600 [03:16<00:00, 13.26it/s]


{'train_runtime': 196.0592, 'train_samples_per_second': 52.943, 'train_steps_per_second': 13.261, 'train_loss': 0.12337329057546763, 'epoch': 10.0}


100%|██████████| 65/65 [00:01<00:00, 64.15it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.2514691650867462, 'eval_f1': 0.6601022153729527, 'eval_precision': 0.7437475878416789, 'eval_recall': 0.6096841474718123, 'eval_runtime': 1.0484, 'eval_samples_per_second': 247.993, 'eval_steps_per_second': 61.998, 'epoch': 10.0}
    comparison                                     model     lan  \
0   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
1   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
2   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
3   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
4   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
5   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
6   validation  epoch-10_batchSize-4_weightsOfDecay-0.01    java   
7   validation  epoch-10_batchSize-4_weightsOfDecay-0.01  python   
8   validation  epoch-10_batchSize-4_weightsOfDecay-0.01  python   
9   v

0,1
eval/f1,▁▅▅▆▆██████
eval/loss,█▃▂▁▂▂▂▂▃▃▃
eval/precision,▁▄▄▄▄██▇▇▇▇
eval/recall,▁▄▅▆▆▇▇████
eval/runtime,█▁▃▄▆▂▆▇▁▄▃
eval/samples_per_second,▁█▆▅▂▇▃▂█▄▆
eval/steps_per_second,▁█▆▅▂▇▃▂█▄▆
train/epoch,▁▂▃▃▃▄▅▆▆▆▇███
train/global_step,▁▂▃▃▃▄▅▆▆▆▇███
train/grad_norm,█▁

0,1
eval/f1,0.6601
eval/loss,0.25147
eval/precision,0.74375
eval/recall,0.60968
eval/runtime,1.0484
eval/samples_per_second,247.993
eval/steps_per_second,61.998
total_flos,682803840384000.0
train/epoch,10.0
train/global_step,2600.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 17830.70 examples/s]
 10%|▉         | 761/7620 [00:56<08:20, 13.70it/s]
 10%|█         | 762/7620 [01:00<08:20, 13.70it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11086812615394592, 'eval_f1': 0.5315733814536376, 'eval_precision': 0.5375057172533048, 'eval_recall': 0.5266704616165224, 'eval_runtime': 3.924, 'eval_samples_per_second': 388.12, 'eval_steps_per_second': 48.674, 'epoch': 1.0}


 13%|█▎        | 1001/7620 [01:21<08:21, 13.20it/s] 

{'loss': 0.1331, 'grad_norm': 3.2133519649505615, 'learning_rate': 4.343832020997376e-05, 'epoch': 1.31}


 20%|█▉        | 1523/7620 [02:00<07:28, 13.61it/s]
 20%|██        | 1524/7620 [02:04<07:28, 13.61it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08986184746026993, 'eval_f1': 0.7637657546455482, 'eval_precision': 0.8479532228253006, 'eval_recall': 0.7255979385952658, 'eval_runtime': 3.8721, 'eval_samples_per_second': 393.326, 'eval_steps_per_second': 49.327, 'epoch': 2.0}


 26%|██▋       | 2002/7620 [02:42<07:05, 13.20it/s]  

{'loss': 0.0752, 'grad_norm': 0.1262432485818863, 'learning_rate': 3.6876640419947505e-05, 'epoch': 2.62}


 30%|███       | 2286/7620 [03:03<06:39, 13.36it/s]
 30%|███       | 2286/7620 [03:07<06:39, 13.36it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.0824332982301712, 'eval_f1': 0.830873998901475, 'eval_precision': 0.852826563792349, 'eval_recall': 0.8130157257342228, 'eval_runtime': 3.9854, 'eval_samples_per_second': 382.142, 'eval_steps_per_second': 47.925, 'epoch': 3.0}


 39%|███▉      | 3002/7620 [04:04<05:52, 13.09it/s]  

{'loss': 0.047, 'grad_norm': 0.7246479392051697, 'learning_rate': 3.0314960629921263e-05, 'epoch': 3.94}


 40%|████      | 3048/7620 [04:08<05:34, 13.68it/s]
 40%|████      | 3048/7620 [04:12<05:34, 13.68it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08915539085865021, 'eval_f1': 0.8592287770306024, 'eval_precision': 0.8901352001792459, 'eval_recall': 0.839147354023419, 'eval_runtime': 4.0336, 'eval_samples_per_second': 377.579, 'eval_steps_per_second': 47.352, 'epoch': 4.0}


 50%|█████     | 3810/7620 [05:13<04:51, 13.08it/s]  
 50%|█████     | 3810/7620 [05:17<04:51, 13.08it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09318532794713974, 'eval_f1': 0.8438173184635633, 'eval_precision': 0.8640275424489146, 'eval_recall': 0.8253590901111237, 'eval_runtime': 4.0603, 'eval_samples_per_second': 375.1, 'eval_steps_per_second': 47.041, 'epoch': 5.0}


 53%|█████▎    | 4002/7620 [05:36<04:40, 12.92it/s]  

{'loss': 0.0258, 'grad_norm': 0.14678247272968292, 'learning_rate': 2.3753280839895015e-05, 'epoch': 5.25}


 60%|██████    | 4572/7620 [06:19<03:43, 13.62it/s]
 60%|██████    | 4572/7620 [06:23<03:43, 13.62it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09985269606113434, 'eval_f1': 0.8528705323206432, 'eval_precision': 0.8719219839608624, 'eval_recall': 0.839174973457611, 'eval_runtime': 3.9977, 'eval_samples_per_second': 380.973, 'eval_steps_per_second': 47.778, 'epoch': 6.0}


 66%|██████▌   | 5002/7620 [06:57<03:26, 12.66it/s]

{'loss': 0.0151, 'grad_norm': 0.011467767879366875, 'learning_rate': 1.7191601049868766e-05, 'epoch': 6.56}


 70%|███████   | 5334/7620 [07:22<02:51, 13.29it/s]
 70%|███████   | 5334/7620 [07:26<02:51, 13.29it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10582815855741501, 'eval_f1': 0.8573420553335013, 'eval_precision': 0.8530344302777949, 'eval_recall': 0.8632318842422106, 'eval_runtime': 4.1547, 'eval_samples_per_second': 366.571, 'eval_steps_per_second': 45.972, 'epoch': 7.0}


 79%|███████▉  | 6002/7620 [08:20<02:02, 13.22it/s]

{'loss': 0.0098, 'grad_norm': 0.010094031691551208, 'learning_rate': 1.062992125984252e-05, 'epoch': 7.87}


 80%|████████  | 6096/7620 [08:27<01:50, 13.77it/s]
 80%|████████  | 6096/7620 [08:31<01:50, 13.77it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10920780152082443, 'eval_f1': 0.8645713948121323, 'eval_precision': 0.8662480220651397, 'eval_recall': 0.863942551477886, 'eval_runtime': 3.9269, 'eval_samples_per_second': 387.842, 'eval_steps_per_second': 48.639, 'epoch': 8.0}


 90%|█████████ | 6858/7620 [09:31<00:57, 13.26it/s]
 90%|█████████ | 6858/7620 [09:35<00:57, 13.26it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11376088857650757, 'eval_f1': 0.8676310826937605, 'eval_precision': 0.8739984828664316, 'eval_recall': 0.8625236192703455, 'eval_runtime': 4.1461, 'eval_samples_per_second': 367.33, 'eval_steps_per_second': 46.067, 'epoch': 9.0}


 92%|█████████▏| 7002/7620 [09:48<00:48, 12.87it/s]

{'loss': 0.0042, 'grad_norm': 0.09863263368606567, 'learning_rate': 4.068241469816273e-06, 'epoch': 9.19}


100%|██████████| 7620/7620 [10:34<00:00, 14.09it/s]
100%|██████████| 7620/7620 [10:40<00:00, 14.09it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11462666839361191, 'eval_f1': 0.8739213792361495, 'eval_precision': 0.8850767151518503, 'eval_recall': 0.8642864247079388, 'eval_runtime': 4.0494, 'eval_samples_per_second': 376.103, 'eval_steps_per_second': 47.167, 'epoch': 10.0}


100%|██████████| 7620/7620 [10:42<00:00, 11.85it/s]


{'train_runtime': 642.8947, 'train_samples_per_second': 94.743, 'train_steps_per_second': 11.853, 'train_loss': 0.041042160956565474, 'epoch': 10.0}


100%|██████████| 191/191 [00:03<00:00, 49.95it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.11462666839361191, 'eval_f1': 0.8739213792361495, 'eval_precision': 0.8850767151518503, 'eval_recall': 0.8642864247079388, 'eval_runtime': 3.8797, 'eval_samples_per_second': 392.553, 'eval_steps_per_second': 49.23, 'epoch': 10.0}


0,1
eval/f1,▁▆▇█▇██████
eval/loss,▇▃▁▂▃▅▆▇███
eval/precision,▁▇▇█▇█▇████
eval/recall,▁▅▇▇▇▇█████
eval/runtime,▂▁▄▅▆▄█▂█▅▁
eval/samples_per_second,▇█▅▄▃▅▁▇▁▃█
eval/steps_per_second,▇█▅▄▃▅▁▇▁▃█
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███
train/grad_norm,█▁▃▁▁▁▁

0,1
eval/f1,0.87392
eval/loss,0.11463
eval/precision,0.88508
eval/recall,0.86429
eval/runtime,3.8797
eval/samples_per_second,392.553
eval/steps_per_second,49.23
total_flos,4006703460288000.0
train/epoch,10.0
train/global_step,7620.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 4093.83 examples/s]
 10%|█         | 189/1890 [00:13<01:57, 14.52it/s]
 10%|█         | 189/1890 [00:14<01:57, 14.52it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3375987112522125, 'eval_f1': 0.38351431821282866, 'eval_precision': 0.5739170506912442, 'eval_recall': 0.29090524979379273, 'eval_runtime': 0.9389, 'eval_samples_per_second': 401.514, 'eval_steps_per_second': 51.121, 'epoch': 1.0}


 20%|█▉        | 377/1890 [00:31<01:51, 13.57it/s]
 20%|██        | 378/1890 [00:32<01:51, 13.57it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.291387140750885, 'eval_f1': 0.558984421249638, 'eval_precision': 0.6539434788754123, 'eval_recall': 0.4946897738318497, 'eval_runtime': 0.9838, 'eval_samples_per_second': 383.195, 'eval_steps_per_second': 48.789, 'epoch': 2.0}


 30%|███       | 567/1890 [00:50<01:32, 14.23it/s]
 30%|███       | 567/1890 [00:51<01:32, 14.23it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.30433326959609985, 'eval_f1': 0.559456751852361, 'eval_precision': 0.8280548697480994, 'eval_recall': 0.5149815163539512, 'eval_runtime': 0.9564, 'eval_samples_per_second': 394.18, 'eval_steps_per_second': 50.187, 'epoch': 3.0}


 40%|███▉      | 755/1890 [01:07<01:21, 13.89it/s]
 40%|████      | 756/1890 [01:08<01:21, 13.89it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3210723102092743, 'eval_f1': 0.6573015479280269, 'eval_precision': 0.7605095114529077, 'eval_recall': 0.6127918292475119, 'eval_runtime': 0.9477, 'eval_samples_per_second': 397.823, 'eval_steps_per_second': 50.651, 'epoch': 4.0}


 50%|█████     | 945/1890 [01:27<01:06, 14.25it/s]
 50%|█████     | 945/1890 [01:28<01:06, 14.25it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31247764825820923, 'eval_f1': 0.7252812432776952, 'eval_precision': 0.7584006211180124, 'eval_recall': 0.7020662346569272, 'eval_runtime': 0.9526, 'eval_samples_per_second': 395.763, 'eval_steps_per_second': 50.389, 'epoch': 5.0}


 53%|█████▎    | 1001/1890 [01:33<01:05, 13.49it/s]

{'loss': 0.2378, 'grad_norm': 3.0889430046081543, 'learning_rate': 2.3544973544973546e-05, 'epoch': 5.29}


 60%|█████▉    | 1133/1890 [01:43<00:58, 13.03it/s]
 60%|██████    | 1134/1890 [01:44<00:58, 13.03it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.34980252385139465, 'eval_f1': 0.7345253112587817, 'eval_precision': 0.7767202718645276, 'eval_recall': 0.7027302096715708, 'eval_runtime': 0.9708, 'eval_samples_per_second': 388.33, 'eval_steps_per_second': 49.443, 'epoch': 6.0}


 70%|███████   | 1323/1890 [02:00<00:41, 13.73it/s]
 70%|███████   | 1323/1890 [02:01<00:41, 13.73it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.37253281474113464, 'eval_f1': 0.7488038873413453, 'eval_precision': 0.7682763075826429, 'eval_recall': 0.741405852301001, 'eval_runtime': 0.9497, 'eval_samples_per_second': 396.954, 'eval_steps_per_second': 50.541, 'epoch': 7.0}


 80%|███████▉  | 1511/1890 [02:17<00:27, 13.73it/s]
 80%|████████  | 1512/1890 [02:18<00:27, 13.73it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.36439356207847595, 'eval_f1': 0.7527956617840585, 'eval_precision': 0.7690432135295433, 'eval_recall': 0.7484976023939963, 'eval_runtime': 1.0432, 'eval_samples_per_second': 361.393, 'eval_steps_per_second': 46.013, 'epoch': 8.0}


 90%|████████▉ | 1700/1890 [02:34<00:14, 13.02it/s]
 90%|█████████ | 1701/1890 [02:35<00:14, 13.02it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.37875083088874817, 'eval_f1': 0.7600198699369007, 'eval_precision': 0.7694466147663079, 'eval_recall': 0.760426822036411, 'eval_runtime': 1.0103, 'eval_samples_per_second': 373.155, 'eval_steps_per_second': 47.51, 'epoch': 9.0}


100%|█████████▉| 1889/1890 [02:52<00:00, 13.72it/s]
100%|██████████| 1890/1890 [02:55<00:00, 13.72it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3802606463432312, 'eval_f1': 0.763774723562, 'eval_precision': 0.7734607551203704, 'eval_recall': 0.7627179913504295, 'eval_runtime': 1.139, 'eval_samples_per_second': 330.989, 'eval_steps_per_second': 42.142, 'epoch': 10.0}


100%|██████████| 1890/1890 [02:57<00:00, 10.62it/s]


{'train_runtime': 178.0011, 'train_samples_per_second': 84.662, 'train_steps_per_second': 10.618, 'train_loss': 0.14253346074825873, 'epoch': 10.0}


100%|██████████| 48/48 [00:01<00:00, 43.92it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.3802606463432312, 'eval_f1': 0.763774723562, 'eval_precision': 0.7734607551203704, 'eval_recall': 0.7627179913504295, 'eval_runtime': 1.1472, 'eval_samples_per_second': 328.632, 'eval_steps_per_second': 41.842, 'epoch': 10.0}


0,1
eval/f1,▁▄▄▆▇▇█████
eval/loss,▅▁▂▃▃▆▇▇███
eval/precision,▁▃█▆▆▇▆▆▆▆▆
eval/recall,▁▄▄▆▇▇█████
eval/runtime,▁▃▂▁▁▂▁▅▃██
eval/samples_per_second,█▆▇█▇▇█▄▅▁▁
eval/steps_per_second,█▆▇█▇▇█▄▅▁▁
train/epoch,▁▂▃▃▄▄▅▆▆▇███
train/global_step,▁▂▃▃▄▄▅▆▆▇███
train/grad_norm,▁

0,1
eval/f1,0.76377
eval/loss,0.38026
eval/precision,0.77346
eval/recall,0.76272
eval/runtime,1.1472
eval/samples_per_second,328.632
eval/steps_per_second,41.842
total_flos,991297601733120.0
train/epoch,10.0
train/global_step,1890.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 10985.75 examples/s]
 10%|▉         | 129/1300 [00:09<01:26, 13.53it/s]
 10%|█         | 130/1300 [00:10<01:26, 13.53it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.30981549620628357, 'eval_f1': 0.11892675852066714, 'eval_precision': 0.12202380952380952, 'eval_recall': 0.11598302687411599, 'eval_runtime': 0.672, 'eval_samples_per_second': 386.913, 'eval_steps_per_second': 49.108, 'epoch': 1.0}


 20%|█▉        | 259/1300 [00:21<01:18, 13.21it/s]
 20%|██        | 260/1300 [00:22<01:18, 13.21it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.256124883890152, 'eval_f1': 0.44793670307468564, 'eval_precision': 0.6031933504602506, 'eval_recall': 0.3705852385636214, 'eval_runtime': 0.7298, 'eval_samples_per_second': 356.274, 'eval_steps_per_second': 45.219, 'epoch': 2.0}


 30%|██▉       | 389/1300 [00:33<01:07, 13.58it/s]
 30%|███       | 390/1300 [00:34<01:06, 13.58it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2330305427312851, 'eval_f1': 0.503564130251988, 'eval_precision': 0.5704739602914065, 'eval_recall': 0.4536534420405041, 'eval_runtime': 0.6581, 'eval_samples_per_second': 395.071, 'eval_steps_per_second': 50.144, 'epoch': 3.0}


 40%|███▉      | 519/1300 [00:45<00:58, 13.35it/s]
 40%|████      | 520/1300 [00:46<00:58, 13.35it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22522272169589996, 'eval_f1': 0.6183768306139753, 'eval_precision': 0.8240020791100938, 'eval_recall': 0.5650555283149038, 'eval_runtime': 0.7398, 'eval_samples_per_second': 351.423, 'eval_steps_per_second': 44.604, 'epoch': 4.0}


 50%|████▉     | 649/1300 [00:57<00:47, 13.76it/s]
 50%|█████     | 650/1300 [00:58<00:47, 13.76it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23171879351139069, 'eval_f1': 0.5675504348463943, 'eval_precision': 0.7477889046437433, 'eval_recall': 0.5172275266697545, 'eval_runtime': 0.668, 'eval_samples_per_second': 389.199, 'eval_steps_per_second': 49.398, 'epoch': 5.0}


 60%|██████    | 780/1300 [01:09<00:39, 13.26it/s]
 60%|██████    | 780/1300 [01:10<00:39, 13.26it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22391551733016968, 'eval_f1': 0.6485154929610267, 'eval_precision': 0.8318027368247899, 'eval_recall': 0.5946178870692336, 'eval_runtime': 0.76, 'eval_samples_per_second': 342.091, 'eval_steps_per_second': 43.419, 'epoch': 6.0}


 70%|███████   | 910/1300 [01:21<00:28, 13.64it/s]
 70%|███████   | 910/1300 [01:22<00:28, 13.64it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22690364718437195, 'eval_f1': 0.6576772644930318, 'eval_precision': 0.869981052452374, 'eval_recall': 0.5913140317162645, 'eval_runtime': 0.6771, 'eval_samples_per_second': 384.006, 'eval_steps_per_second': 48.739, 'epoch': 7.0}


 77%|███████▋  | 1001/1300 [01:31<00:22, 13.41it/s]

{'loss': 0.1552, 'grad_norm': 0.9271907210350037, 'learning_rate': 1.153846153846154e-05, 'epoch': 7.69}


 80%|███████▉  | 1039/1300 [01:33<00:19, 13.12it/s]
 80%|████████  | 1040/1300 [01:34<00:19, 13.12it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23856432735919952, 'eval_f1': 0.6603418125649397, 'eval_precision': 0.7573890089571051, 'eval_recall': 0.6214056757048058, 'eval_runtime': 0.7362, 'eval_samples_per_second': 353.157, 'eval_steps_per_second': 44.824, 'epoch': 8.0}


 90%|████████▉ | 1169/1300 [01:48<00:09, 13.74it/s]
 90%|█████████ | 1170/1300 [01:49<00:09, 13.74it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23017580807209015, 'eval_f1': 0.6903460290572092, 'eval_precision': 0.8018711943225595, 'eval_recall': 0.6363470565124478, 'eval_runtime': 0.6732, 'eval_samples_per_second': 386.189, 'eval_steps_per_second': 49.016, 'epoch': 9.0}


100%|█████████▉| 1299/1300 [02:00<00:00, 13.72it/s]
100%|██████████| 1300/1300 [02:02<00:00, 13.72it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22841039299964905, 'eval_f1': 0.6946755969594316, 'eval_precision': 0.8064119638793249, 'eval_recall': 0.6388499190435989, 'eval_runtime': 0.7532, 'eval_samples_per_second': 345.216, 'eval_steps_per_second': 43.816, 'epoch': 10.0}


100%|██████████| 1300/1300 [02:05<00:00, 10.39it/s]


{'train_runtime': 125.1751, 'train_samples_per_second': 82.924, 'train_steps_per_second': 10.385, 'train_loss': 0.12794013903691218, 'epoch': 10.0}


100%|██████████| 33/33 [00:00<00:00, 42.24it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.22841039299964905, 'eval_f1': 0.6946755969594316, 'eval_precision': 0.8064119638793249, 'eval_recall': 0.6388499190435989, 'eval_runtime': 0.8436, 'eval_samples_per_second': 308.22, 'eval_steps_per_second': 39.12, 'epoch': 10.0}
    comparison                                     model     lan  \
0   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
1   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
2   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
3   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
4   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
5   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
6   validation  epoch-10_batchSize-8_weightsOfDecay-0.01    java   
7   validation  epoch-10_batchSize-8_weightsOfDecay-0.01  python   
8   validation  epoch-10_batchSize-8_weightsOfDecay-0.01  python   
9   va

0,1
eval/f1,▁▅▆▇▆▇█████
eval/loss,█▄▂▁▂▁▁▂▂▁▁
eval/precision,▁▆▅█▇██▇▇▇▇
eval/recall,▁▄▆▇▆▇▇████
eval/runtime,▂▄▁▄▁▅▂▄▂▅█
eval/samples_per_second,▇▅█▄█▄▇▅▇▄▁
eval/steps_per_second,▇▅█▄█▄▇▅▇▄▁
train/epoch,▁▂▃▃▄▅▆▆▆▇███
train/global_step,▁▂▃▃▄▅▆▆▆▇███
train/grad_norm,▁

0,1
eval/f1,0.69468
eval/loss,0.22841
eval/precision,0.80641
eval/recall,0.63885
eval/runtime,0.8436
eval/samples_per_second,308.22
eval/steps_per_second,39.12
total_flos,682803840384000.0
train/epoch,10.0
train/global_step,1300.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 16597.25 examples/s]
  4%|▍         | 1003/22845 [01:02<22:54, 15.89it/s]

{'loss': 0.1611, 'grad_norm': 0.3325408101081848, 'learning_rate': 4.781133727292624e-05, 'epoch': 0.66}


  7%|▋         | 1523/22845 [01:35<22:01, 16.14it/s]
  7%|▋         | 1523/22845 [01:41<22:01, 16.14it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11084381490945816, 'eval_f1': 0.6492310932346846, 'eval_precision': 0.7789301738777731, 'eval_recall': 0.6242295121744385, 'eval_runtime': 5.8497, 'eval_samples_per_second': 260.356, 'eval_steps_per_second': 65.132, 'epoch': 1.0}


  9%|▉         | 2003/22845 [02:14<22:14, 15.62it/s]  

{'loss': 0.1116, 'grad_norm': 1.8992576599121094, 'learning_rate': 4.562267454585248e-05, 'epoch': 1.31}


 13%|█▎        | 3003/22845 [03:17<22:10, 14.92it/s]

{'loss': 0.0998, 'grad_norm': 1.768127202987671, 'learning_rate': 4.343401181877873e-05, 'epoch': 1.97}


 13%|█▎        | 3045/22845 [03:20<20:26, 16.14it/s]
 13%|█▎        | 3046/22845 [03:26<20:26, 16.14it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10014191269874573, 'eval_f1': 0.7679403871696643, 'eval_precision': 0.8336455298860128, 'eval_recall': 0.7343393769984543, 'eval_runtime': 6.4189, 'eval_samples_per_second': 237.269, 'eval_steps_per_second': 59.356, 'epoch': 2.0}


 18%|█▊        | 4003/22845 [04:29<19:50, 15.82it/s]  

{'loss': 0.0779, 'grad_norm': 0.030443362891674042, 'learning_rate': 4.1245349091704974e-05, 'epoch': 2.63}


 20%|██        | 4569/22845 [05:05<20:10, 15.10it/s]
 20%|██        | 4569/22845 [05:12<20:10, 15.10it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09067926555871964, 'eval_f1': 0.8212814567267772, 'eval_precision': 0.8424439508985818, 'eval_recall': 0.804577682805862, 'eval_runtime': 7.0744, 'eval_samples_per_second': 215.283, 'eval_steps_per_second': 53.856, 'epoch': 3.0}


 22%|██▏       | 5003/22845 [05:42<18:58, 15.67it/s]  

{'loss': 0.0677, 'grad_norm': 3.901862621307373, 'learning_rate': 3.9056686364631214e-05, 'epoch': 3.28}


 26%|██▋       | 6003/22845 [06:45<17:23, 16.14it/s]

{'loss': 0.0587, 'grad_norm': 1.4626305103302002, 'learning_rate': 3.6868023637557454e-05, 'epoch': 3.94}


 27%|██▋       | 6091/22845 [06:50<17:29, 15.96it/s]
 27%|██▋       | 6092/22845 [06:57<17:29, 15.96it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10941781848669052, 'eval_f1': 0.8234640159257571, 'eval_precision': 0.8663325589897487, 'eval_recall': 0.789837654951206, 'eval_runtime': 6.5343, 'eval_samples_per_second': 233.078, 'eval_steps_per_second': 58.308, 'epoch': 4.0}


 31%|███       | 7003/22845 [07:57<17:10, 15.38it/s]  

{'loss': 0.0414, 'grad_norm': 0.015991317108273506, 'learning_rate': 3.467936091048369e-05, 'epoch': 4.6}


 33%|███▎      | 7615/22845 [08:36<15:52, 15.99it/s]
 33%|███▎      | 7615/22845 [08:42<15:52, 15.99it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10832557827234268, 'eval_f1': 0.8193985240968432, 'eval_precision': 0.8348581810387269, 'eval_recall': 0.8074005971434965, 'eval_runtime': 6.0972, 'eval_samples_per_second': 249.786, 'eval_steps_per_second': 62.487, 'epoch': 5.0}


 35%|███▌      | 8003/22845 [09:08<15:50, 15.62it/s]  

{'loss': 0.0425, 'grad_norm': 0.007733296602964401, 'learning_rate': 3.249069818340994e-05, 'epoch': 5.25}


 39%|███▉      | 9001/22845 [10:11<15:20, 15.04it/s]

{'loss': 0.0321, 'grad_norm': 9.187843322753906, 'learning_rate': 3.030203545633618e-05, 'epoch': 5.91}


 40%|███▉      | 9137/22845 [10:20<13:54, 16.42it/s]
 40%|████      | 9138/22845 [10:27<13:54, 16.42it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1276833564043045, 'eval_f1': 0.8128157032465236, 'eval_precision': 0.8317564537973893, 'eval_recall': 0.797739694966778, 'eval_runtime': 6.2377, 'eval_samples_per_second': 244.161, 'eval_steps_per_second': 61.08, 'epoch': 6.0}


 44%|████▍     | 10003/22845 [11:24<13:23, 15.97it/s] 

{'loss': 0.0276, 'grad_norm': 0.015057794749736786, 'learning_rate': 2.8113372729262422e-05, 'epoch': 6.57}


 47%|████▋     | 10661/22845 [12:05<13:20, 15.22it/s]
 47%|████▋     | 10661/22845 [12:12<13:20, 15.22it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11954731494188309, 'eval_f1': 0.8567114242659036, 'eval_precision': 0.8701219951019173, 'eval_recall': 0.8456170692657049, 'eval_runtime': 7.343, 'eval_samples_per_second': 207.409, 'eval_steps_per_second': 51.886, 'epoch': 7.0}


 48%|████▊     | 11003/22845 [12:36<12:40, 15.57it/s]  

{'loss': 0.0281, 'grad_norm': 0.005963919684290886, 'learning_rate': 2.5924710002188662e-05, 'epoch': 7.22}


 53%|█████▎    | 12003/22845 [13:38<11:25, 15.82it/s]

{'loss': 0.0195, 'grad_norm': 0.009913146495819092, 'learning_rate': 2.3736047275114905e-05, 'epoch': 7.88}


 53%|█████▎    | 12183/22845 [13:50<10:25, 17.05it/s]
 53%|█████▎    | 12184/22845 [13:56<10:25, 17.05it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1471843421459198, 'eval_f1': 0.8219376660714188, 'eval_precision': 0.8406370569701963, 'eval_recall': 0.8138597577591481, 'eval_runtime': 6.5095, 'eval_samples_per_second': 233.967, 'eval_steps_per_second': 58.53, 'epoch': 8.0}


 57%|█████▋    | 13003/22845 [14:49<10:48, 15.17it/s]  

{'loss': 0.0144, 'grad_norm': 0.16208408772945404, 'learning_rate': 2.1547384548041148e-05, 'epoch': 8.54}


 60%|██████    | 13707/22845 [15:35<10:15, 14.84it/s]
 60%|██████    | 13707/22845 [15:41<10:15, 14.84it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1447901725769043, 'eval_f1': 0.8439645621516749, 'eval_precision': 0.8399386139760242, 'eval_recall': 0.8515072127902157, 'eval_runtime': 5.9524, 'eval_samples_per_second': 255.863, 'eval_steps_per_second': 64.008, 'epoch': 9.0}


 61%|██████▏   | 14003/22845 [16:02<09:48, 15.02it/s]  

{'loss': 0.0126, 'grad_norm': 0.008744856342673302, 'learning_rate': 1.935872182096739e-05, 'epoch': 9.19}


 66%|██████▌   | 15003/22845 [17:05<08:21, 15.63it/s]

{'loss': 0.0117, 'grad_norm': 0.024110374972224236, 'learning_rate': 1.717005909389363e-05, 'epoch': 9.85}


 67%|██████▋   | 15229/22845 [17:19<07:43, 16.45it/s]
 67%|██████▋   | 15230/22845 [17:26<07:43, 16.45it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13178174197673798, 'eval_f1': 0.851337386195481, 'eval_precision': 0.8591047781604711, 'eval_recall': 0.8476675031719934, 'eval_runtime': 6.2152, 'eval_samples_per_second': 245.045, 'eval_steps_per_second': 61.302, 'epoch': 10.0}


 70%|███████   | 16003/22845 [18:16<06:59, 16.29it/s]  

{'loss': 0.0073, 'grad_norm': 0.004774145781993866, 'learning_rate': 1.4981396366819875e-05, 'epoch': 10.51}


 73%|███████▎  | 16753/22845 [19:03<07:02, 14.43it/s]
 73%|███████▎  | 16753/22845 [19:10<07:02, 14.43it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13862943649291992, 'eval_f1': 0.852465468109564, 'eval_precision': 0.8751025778368431, 'eval_recall': 0.8328960335998392, 'eval_runtime': 6.9265, 'eval_samples_per_second': 219.882, 'eval_steps_per_second': 55.006, 'epoch': 11.0}


 74%|███████▍  | 17003/22845 [19:27<06:12, 15.67it/s]  

{'loss': 0.007, 'grad_norm': 0.001335371402092278, 'learning_rate': 1.2792733639746115e-05, 'epoch': 11.16}


 79%|███████▉  | 18003/22845 [20:30<05:04, 15.90it/s]

{'loss': 0.0076, 'grad_norm': 0.0023039942607283592, 'learning_rate': 1.0604070912672358e-05, 'epoch': 11.82}


 80%|███████▉  | 18275/22845 [20:47<05:02, 15.11it/s]
 80%|████████  | 18276/22845 [20:53<05:02, 15.11it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.14610664546489716, 'eval_f1': 0.853251748675697, 'eval_precision': 0.8763338654022833, 'eval_recall': 0.8342267244099829, 'eval_runtime': 6.7061, 'eval_samples_per_second': 227.106, 'eval_steps_per_second': 56.814, 'epoch': 12.0}


 83%|████████▎ | 19003/22845 [21:41<04:10, 15.34it/s]  

{'loss': 0.0035, 'grad_norm': 0.041132815182209015, 'learning_rate': 8.4154081855986e-06, 'epoch': 12.48}


 87%|████████▋ | 19799/22845 [22:32<03:10, 16.00it/s]
 87%|████████▋ | 19799/22845 [22:38<03:10, 16.00it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.15286381542682648, 'eval_f1': 0.840699441967503, 'eval_precision': 0.8540016506532953, 'eval_recall': 0.8310567423483987, 'eval_runtime': 6.0473, 'eval_samples_per_second': 251.847, 'eval_steps_per_second': 63.003, 'epoch': 13.0}


 88%|████████▊ | 20003/22845 [22:53<03:03, 15.49it/s]  

{'loss': 0.0038, 'grad_norm': 0.024911383166909218, 'learning_rate': 6.2267454585248415e-06, 'epoch': 13.13}


 92%|█████████▏| 21003/22845 [23:56<02:01, 15.10it/s]

{'loss': 0.0042, 'grad_norm': 0.0013514080783352256, 'learning_rate': 4.038082731451084e-06, 'epoch': 13.79}


 93%|█████████▎| 21321/22845 [24:16<01:34, 16.06it/s]
 93%|█████████▎| 21322/22845 [24:23<01:34, 16.06it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.14817935228347778, 'eval_f1': 0.8571621502276932, 'eval_precision': 0.8623635331028826, 'eval_recall': 0.8525724575740921, 'eval_runtime': 6.393, 'eval_samples_per_second': 238.229, 'eval_steps_per_second': 59.596, 'epoch': 14.0}


 96%|█████████▋| 22003/22845 [25:07<00:57, 14.74it/s]

{'loss': 0.0016, 'grad_norm': 0.05173032358288765, 'learning_rate': 1.8494200043773256e-06, 'epoch': 14.45}


100%|██████████| 22845/22845 [26:00<00:00, 15.73it/s]
100%|██████████| 22845/22845 [26:08<00:00, 15.73it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1533026397228241, 'eval_f1': 0.8559259057564066, 'eval_precision': 0.8600939530084342, 'eval_recall': 0.852404587841005, 'eval_runtime': 6.8668, 'eval_samples_per_second': 221.793, 'eval_steps_per_second': 55.485, 'epoch': 15.0}


100%|██████████| 22845/22845 [26:10<00:00, 14.54it/s]


{'train_runtime': 1570.8061, 'train_samples_per_second': 58.164, 'train_steps_per_second': 14.543, 'train_loss': 0.03690393449545794, 'epoch': 15.0}


100%|██████████| 381/381 [00:05<00:00, 68.51it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.14817935228347778, 'eval_f1': 0.8571621502276932, 'eval_precision': 0.8623635331028826, 'eval_recall': 0.8525724575740921, 'eval_runtime': 5.5931, 'eval_samples_per_second': 272.299, 'eval_steps_per_second': 68.119, 'epoch': 15.0}


0,1
eval/f1,▁▅▇▇▇▇█▇████▇███
eval/loss,▃▂▁▃▃▅▄▇▇▆▆▇█▇█▇
eval/precision,▁▅▆▇▅▅█▅▅▇██▆▇▇▇
eval/recall,▁▄▇▆▇▆█▇██▇▇▇███
eval/runtime,▂▄▇▅▃▄█▅▂▃▆▅▃▄▆▁
eval/samples_per_second,▇▄▂▄▆▅▁▄▆▅▂▃▆▄▃█
eval/steps_per_second,▇▄▂▄▆▅▁▄▆▅▂▃▆▄▃█
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▁▂▂▁▄▂▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f1,0.85716
eval/loss,0.14818
eval/precision,0.86236
eval/recall,0.85257
eval/runtime,5.5931
eval/samples_per_second,272.299
eval/steps_per_second,68.119
total_flos,6010055190432000.0
train/epoch,15.0
train/global_step,22845.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 10009.85 examples/s]
  7%|▋         | 377/5655 [00:24<05:04, 17.31it/s]
  7%|▋         | 377/5655 [00:25<05:04, 17.31it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31516456604003906, 'eval_f1': 0.4516626639061422, 'eval_precision': 0.5214957012236728, 'eval_recall': 0.3993407433164594, 'eval_runtime': 1.688, 'eval_samples_per_second': 223.344, 'eval_steps_per_second': 56.28, 'epoch': 1.0}


 13%|█▎        | 753/5655 [00:50<05:07, 15.95it/s]
 13%|█▎        | 754/5655 [00:52<05:07, 15.95it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3311464786529541, 'eval_f1': 0.5371053258123825, 'eval_precision': 0.6012222222222221, 'eval_recall': 0.5021425746992912, 'eval_runtime': 1.5441, 'eval_samples_per_second': 244.152, 'eval_steps_per_second': 61.524, 'epoch': 2.0}


 18%|█▊        | 1003/5655 [01:09<04:55, 15.77it/s]

{'loss': 0.3376, 'grad_norm': 2.7655694484710693, 'learning_rate': 4.1158267020335985e-05, 'epoch': 2.65}


 20%|██        | 1131/5655 [01:17<04:23, 17.19it/s]
 20%|██        | 1131/5655 [01:19<04:23, 17.19it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31990358233451843, 'eval_f1': 0.6618431682472778, 'eval_precision': 0.8159988468477948, 'eval_recall': 0.6050001933742255, 'eval_runtime': 1.5169, 'eval_samples_per_second': 248.531, 'eval_steps_per_second': 62.627, 'epoch': 3.0}


 27%|██▋       | 1507/5655 [01:44<04:11, 16.51it/s]
 27%|██▋       | 1508/5655 [01:46<04:11, 16.51it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.332945853471756, 'eval_f1': 0.7111219611302848, 'eval_precision': 0.7951220798671594, 'eval_recall': 0.6608253352581686, 'eval_runtime': 1.5677, 'eval_samples_per_second': 240.48, 'eval_steps_per_second': 60.598, 'epoch': 4.0}


 33%|███▎      | 1885/5655 [02:11<03:51, 16.26it/s]
 33%|███▎      | 1885/5655 [02:13<03:51, 16.26it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3798815906047821, 'eval_f1': 0.7105963687263267, 'eval_precision': 0.7550464178074954, 'eval_recall': 0.6846583593099375, 'eval_runtime': 1.5886, 'eval_samples_per_second': 237.313, 'eval_steps_per_second': 59.8, 'epoch': 5.0}


 35%|███▌      | 2003/5655 [02:22<03:57, 15.36it/s]

{'loss': 0.1363, 'grad_norm': 0.09172403067350388, 'learning_rate': 3.2316534040671975e-05, 'epoch': 5.31}


 40%|███▉      | 2261/5655 [02:38<03:19, 17.02it/s]
 40%|████      | 2262/5655 [02:40<03:19, 17.02it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4240279197692871, 'eval_f1': 0.7253077197551555, 'eval_precision': 0.7575248210192768, 'eval_recall': 0.7029728656114763, 'eval_runtime': 1.5694, 'eval_samples_per_second': 240.22, 'eval_steps_per_second': 60.533, 'epoch': 6.0}


 47%|████▋     | 2639/5655 [03:05<03:05, 16.23it/s]
 47%|████▋     | 2639/5655 [03:07<03:05, 16.23it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4303051233291626, 'eval_f1': 0.730535582706135, 'eval_precision': 0.7422169509540574, 'eval_recall': 0.7226340745543193, 'eval_runtime': 1.6059, 'eval_samples_per_second': 234.756, 'eval_steps_per_second': 59.156, 'epoch': 7.0}


 53%|█████▎    | 3003/5655 [03:31<02:50, 15.53it/s]

{'loss': 0.0521, 'grad_norm': 0.04735327512025833, 'learning_rate': 2.347480106100796e-05, 'epoch': 7.96}


 53%|█████▎    | 3015/5655 [03:32<02:44, 16.03it/s]
 53%|█████▎    | 3016/5655 [03:34<02:44, 16.03it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4395894706249237, 'eval_f1': 0.7362242748448355, 'eval_precision': 0.7498362206582871, 'eval_recall': 0.7323337925092217, 'eval_runtime': 1.6031, 'eval_samples_per_second': 235.164, 'eval_steps_per_second': 59.259, 'epoch': 8.0}


 60%|██████    | 3393/5655 [03:59<02:16, 16.59it/s]
 60%|██████    | 3393/5655 [04:01<02:16, 16.59it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4536871016025543, 'eval_f1': 0.7643761778629841, 'eval_precision': 0.7626608174434261, 'eval_recall': 0.7677817444886588, 'eval_runtime': 1.6164, 'eval_samples_per_second': 233.231, 'eval_steps_per_second': 58.772, 'epoch': 9.0}


 67%|██████▋   | 3769/5655 [04:26<02:00, 15.62it/s]
 67%|██████▋   | 3770/5655 [04:28<02:00, 15.62it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.49923962354660034, 'eval_f1': 0.7619170944378327, 'eval_precision': 0.7524672820404528, 'eval_recall': 0.7738530875066421, 'eval_runtime': 1.6164, 'eval_samples_per_second': 233.24, 'eval_steps_per_second': 58.774, 'epoch': 10.0}


 71%|███████   | 4003/5655 [04:44<01:40, 16.39it/s]

{'loss': 0.0199, 'grad_norm': 0.08224429935216904, 'learning_rate': 1.4633068081343945e-05, 'epoch': 10.61}


 73%|███████▎  | 4147/5655 [04:53<01:43, 14.61it/s]
 73%|███████▎  | 4147/5655 [04:55<01:43, 14.61it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.472241073846817, 'eval_f1': 0.7614440920990179, 'eval_precision': 0.7643978317857606, 'eval_recall': 0.7602046344652875, 'eval_runtime': 1.6857, 'eval_samples_per_second': 223.64, 'eval_steps_per_second': 56.355, 'epoch': 11.0}


 80%|███████▉  | 4523/5655 [05:21<01:10, 16.08it/s]
 80%|████████  | 4524/5655 [05:22<01:10, 16.08it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.49951818585395813, 'eval_f1': 0.7611856060397095, 'eval_precision': 0.7541173526455464, 'eval_recall': 0.7722541152965107, 'eval_runtime': 1.6877, 'eval_samples_per_second': 223.381, 'eval_steps_per_second': 56.29, 'epoch': 12.0}


 87%|████████▋ | 4901/5655 [05:48<00:48, 15.55it/s]
 87%|████████▋ | 4901/5655 [05:50<00:48, 15.55it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.49406683444976807, 'eval_f1': 0.7712859567538934, 'eval_precision': 0.7720189257714292, 'eval_recall': 0.7724765220250899, 'eval_runtime': 1.7235, 'eval_samples_per_second': 218.746, 'eval_steps_per_second': 55.122, 'epoch': 13.0}


 88%|████████▊ | 5003/5655 [05:58<00:40, 15.90it/s]

{'loss': 0.0055, 'grad_norm': 0.021688014268875122, 'learning_rate': 5.7913351016799295e-06, 'epoch': 13.26}


 93%|█████████▎| 5277/5655 [06:15<00:25, 14.81it/s]
 93%|█████████▎| 5278/5655 [06:17<00:25, 14.81it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5060080289840698, 'eval_f1': 0.7688670256467869, 'eval_precision': 0.7644105195420984, 'eval_recall': 0.7749506046050255, 'eval_runtime': 2.1007, 'eval_samples_per_second': 179.464, 'eval_steps_per_second': 45.223, 'epoch': 14.0}


100%|██████████| 5655/5655 [06:43<00:00, 15.34it/s]
100%|██████████| 5655/5655 [06:46<00:00, 15.34it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5067896246910095, 'eval_f1': 0.7680649102582175, 'eval_precision': 0.7698968533751143, 'eval_recall': 0.7683323131377982, 'eval_runtime': 1.6981, 'eval_samples_per_second': 222.009, 'eval_steps_per_second': 55.944, 'epoch': 15.0}


100%|██████████| 5655/5655 [06:48<00:00, 13.84it/s]


{'train_runtime': 408.6347, 'train_samples_per_second': 55.318, 'train_steps_per_second': 13.839, 'train_loss': 0.09777756650083155, 'epoch': 15.0}


100%|██████████| 95/95 [00:01<00:00, 58.55it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.49406683444976807, 'eval_f1': 0.7712859567538934, 'eval_precision': 0.7720189257714292, 'eval_recall': 0.7724765220250899, 'eval_runtime': 1.6592, 'eval_samples_per_second': 227.216, 'eval_steps_per_second': 57.256, 'epoch': 15.0}


0,1
eval/f1,▁▃▆▇▇▇▇▇████████
eval/loss,▁▂▁▂▃▅▅▆▆█▇█████
eval/precision,▁▃██▇▇▆▆▇▆▇▇▇▇▇▇
eval/recall,▁▃▅▆▆▇▇▇████████
eval/runtime,▃▁▁▂▂▂▂▂▂▂▃▃▃█▃▃
eval/samples_per_second,▅██▇▇▇▇▇▆▆▅▅▅▁▅▆
eval/steps_per_second,▅██▇▇▇▇▇▆▆▅▅▅▁▅▆
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▇▇▇▇███
train/grad_norm,█▁▁▁▁

0,1
eval/f1,0.77129
eval/loss,0.49407
eval/precision,0.77202
eval/recall,0.77248
eval/runtime,1.6592
eval/samples_per_second,227.216
eval/steps_per_second,57.256
total_flos,1486946402599680.0
train/epoch,15.0
train/global_step,5655.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 3738.15 examples/s]
  7%|▋         | 259/3900 [00:16<04:04, 14.89it/s]
  7%|▋         | 260/3900 [00:17<04:04, 14.89it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3063732385635376, 'eval_f1': 0.18135467980295567, 'eval_precision': 0.3287313287313287, 'eval_recall': 0.17039453489421888, 'eval_runtime': 0.9384, 'eval_samples_per_second': 277.082, 'eval_steps_per_second': 69.27, 'epoch': 1.0}


 13%|█▎        | 519/3900 [00:35<03:26, 16.35it/s]
 13%|█▎        | 520/3900 [00:36<03:26, 16.35it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2488754242658615, 'eval_f1': 0.4411457033637484, 'eval_precision': 0.5890293916609706, 'eval_recall': 0.3621452409198686, 'eval_runtime': 1.0287, 'eval_samples_per_second': 252.739, 'eval_steps_per_second': 63.185, 'epoch': 2.0}


 20%|█▉        | 779/3900 [00:54<03:14, 16.01it/s]
 20%|██        | 780/3900 [00:55<03:14, 16.01it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2529616355895996, 'eval_f1': 0.4385414907797712, 'eval_precision': 0.5531757403422198, 'eval_recall': 0.403096350222814, 'eval_runtime': 1.0691, 'eval_samples_per_second': 243.192, 'eval_steps_per_second': 60.798, 'epoch': 3.0}


 26%|██▌       | 1003/3900 [01:11<03:04, 15.72it/s]

{'loss': 0.2334, 'grad_norm': 1.3214563131332397, 'learning_rate': 3.717948717948718e-05, 'epoch': 3.85}


 27%|██▋       | 1039/3900 [01:13<03:06, 15.30it/s]
 27%|██▋       | 1040/3900 [01:15<03:06, 15.30it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2262565940618515, 'eval_f1': 0.599112929557396, 'eval_precision': 0.6646286380905192, 'eval_recall': 0.5511883639400345, 'eval_runtime': 1.2779, 'eval_samples_per_second': 203.465, 'eval_steps_per_second': 50.866, 'epoch': 4.0}


 33%|███▎      | 1299/3900 [01:33<02:45, 15.73it/s]
 33%|███▎      | 1300/3900 [01:34<02:45, 15.73it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2670806348323822, 'eval_f1': 0.5879814600024214, 'eval_precision': 0.8628384203439688, 'eval_recall': 0.5384294428025886, 'eval_runtime': 0.9824, 'eval_samples_per_second': 264.655, 'eval_steps_per_second': 66.164, 'epoch': 5.0}


 40%|███▉      | 1559/3900 [01:52<02:25, 16.08it/s]
 40%|████      | 1560/3900 [01:53<02:25, 16.08it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2513812184333801, 'eval_f1': 0.6133046550382024, 'eval_precision': 0.8085017512159504, 'eval_recall': 0.5527074849542225, 'eval_runtime': 1.1244, 'eval_samples_per_second': 231.234, 'eval_steps_per_second': 57.809, 'epoch': 6.0}


 47%|████▋     | 1819/3900 [02:11<02:09, 16.08it/s]
 47%|████▋     | 1820/3900 [02:12<02:09, 16.08it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24446925520896912, 'eval_f1': 0.6990906893502445, 'eval_precision': 0.8431608357628766, 'eval_recall': 0.6502085572956133, 'eval_runtime': 1.1805, 'eval_samples_per_second': 220.249, 'eval_steps_per_second': 55.062, 'epoch': 7.0}


 51%|█████▏    | 2003/3900 [02:25<02:05, 15.09it/s]

{'loss': 0.0718, 'grad_norm': 0.04409731552004814, 'learning_rate': 2.435897435897436e-05, 'epoch': 7.69}


 53%|█████▎    | 2079/3900 [02:30<01:53, 16.01it/s]
 53%|█████▎    | 2080/3900 [02:31<01:53, 16.01it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.25965821743011475, 'eval_f1': 0.6539823459892192, 'eval_precision': 0.7268505252057883, 'eval_recall': 0.6147889540879495, 'eval_runtime': 0.9786, 'eval_samples_per_second': 265.693, 'eval_steps_per_second': 66.423, 'epoch': 8.0}


 60%|█████▉    | 2339/3900 [02:49<01:34, 16.43it/s]
 60%|██████    | 2340/3900 [02:50<01:34, 16.43it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.28283175826072693, 'eval_f1': 0.6444835140426924, 'eval_precision': 0.7329144238366789, 'eval_recall': 0.6017325303387573, 'eval_runtime': 1.0189, 'eval_samples_per_second': 255.168, 'eval_steps_per_second': 63.792, 'epoch': 9.0}


 67%|██████▋   | 2599/3900 [03:09<01:23, 15.61it/s]
 67%|██████▋   | 2600/3900 [03:10<01:23, 15.61it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.28907275199890137, 'eval_f1': 0.6908479836438742, 'eval_precision': 0.7706460206460207, 'eval_recall': 0.65302504734556, 'eval_runtime': 1.0859, 'eval_samples_per_second': 239.434, 'eval_steps_per_second': 59.859, 'epoch': 10.0}


 73%|███████▎  | 2859/3900 [03:28<01:02, 16.68it/s]
 73%|███████▎  | 2860/3900 [03:29<01:02, 16.68it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.28628990054130554, 'eval_f1': 0.686605863108022, 'eval_precision': 0.7419872191300764, 'eval_recall': 0.648185642866684, 'eval_runtime': 0.9781, 'eval_samples_per_second': 265.814, 'eval_steps_per_second': 66.454, 'epoch': 11.0}


 77%|███████▋  | 3003/3900 [03:40<00:59, 15.10it/s]

{'loss': 0.0252, 'grad_norm': 0.0244907233864069, 'learning_rate': 1.153846153846154e-05, 'epoch': 11.54}


 80%|███████▉  | 3119/3900 [03:47<00:50, 15.37it/s]
 80%|████████  | 3120/3900 [03:48<00:50, 15.37it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.29149433970451355, 'eval_f1': 0.6820925562914752, 'eval_precision': 0.7396951894620729, 'eval_recall': 0.6497748743039397, 'eval_runtime': 1.0379, 'eval_samples_per_second': 250.51, 'eval_steps_per_second': 62.628, 'epoch': 12.0}


 87%|████████▋ | 3379/3900 [04:06<00:31, 16.48it/s]
 87%|████████▋ | 3380/3900 [04:07<00:31, 16.48it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2803678512573242, 'eval_f1': 0.7060660949103367, 'eval_precision': 0.7628543374515447, 'eval_recall': 0.6719622063675847, 'eval_runtime': 1.0563, 'eval_samples_per_second': 246.134, 'eval_steps_per_second': 61.534, 'epoch': 13.0}


 93%|█████████▎| 3639/3900 [04:25<00:17, 14.69it/s]
 93%|█████████▎| 3640/3900 [04:27<00:17, 14.69it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2870853841304779, 'eval_f1': 0.7022800094989124, 'eval_precision': 0.76000111000111, 'eval_recall': 0.6687403218691044, 'eval_runtime': 1.2978, 'eval_samples_per_second': 200.34, 'eval_steps_per_second': 50.085, 'epoch': 14.0}


100%|█████████▉| 3899/3900 [04:45<00:00, 16.66it/s]
100%|██████████| 3900/3900 [04:48<00:00, 16.66it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2923307418823242, 'eval_f1': 0.700610378236431, 'eval_precision': 0.7589147780324251, 'eval_recall': 0.6681961041820296, 'eval_runtime': 1.0079, 'eval_samples_per_second': 257.95, 'eval_steps_per_second': 64.488, 'epoch': 15.0}


100%|██████████| 3900/3900 [04:50<00:00, 13.42it/s]


{'train_runtime': 290.6435, 'train_samples_per_second': 53.571, 'train_steps_per_second': 13.419, 'train_loss': 0.08669038674770257, 'epoch': 15.0}


100%|██████████| 65/65 [00:01<00:00, 50.12it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.2803678512573242, 'eval_f1': 0.7060660949103367, 'eval_precision': 0.7628543374515447, 'eval_recall': 0.6719622063675847, 'eval_runtime': 1.3308, 'eval_samples_per_second': 195.376, 'eval_steps_per_second': 48.844, 'epoch': 15.0}
    comparison                                     model     lan  \
0   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
1   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
2   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
3   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
4   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
5   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
6   validation  epoch-15_batchSize-4_weightsOfDecay-0.01    java   
7   validation  epoch-15_batchSize-4_weightsOfDecay-0.01  python   
8   validation  epoch-15_batchSize-4_weightsOfDecay-0.01  python   
9   v

0,1
eval/f1,▁▄▄▇▆▇█▇▇███████
eval/loss,█▃▃▁▅▃▃▄▆▆▆▇▆▆▇▆
eval/precision,▁▄▄▅█▇█▆▆▇▆▆▇▇▇▇
eval/recall,▁▄▄▆▆▆█▇▇███████
eval/runtime,▁▃▃▇▂▄▅▂▂▄▂▃▃▇▂█
eval/samples_per_second,█▆▅▂▇▄▃▇▆▅▇▆▅▁▆▁
eval/steps_per_second,█▆▅▂▇▄▃▇▆▅▇▆▅▁▆▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,█▁▁

0,1
eval/f1,0.70607
eval/loss,0.28037
eval/precision,0.76285
eval/recall,0.67196
eval/runtime,1.3308
eval/samples_per_second,195.376
eval/steps_per_second,48.844
total_flos,1024205760576000.0
train/epoch,15.0
train/global_step,3900.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 16338.53 examples/s]
  7%|▋         | 761/11430 [00:56<13:15, 13.41it/s]
  7%|▋         | 762/11430 [01:00<13:15, 13.41it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11700481176376343, 'eval_f1': 0.6384419035146587, 'eval_precision': 0.655397790563792, 'eval_recall': 0.6273599055553943, 'eval_runtime': 3.8238, 'eval_samples_per_second': 398.3, 'eval_steps_per_second': 49.951, 'epoch': 1.0}


  9%|▉         | 1002/11430 [01:19<13:25, 12.94it/s] 

{'loss': 0.1304, 'grad_norm': 3.15448260307312, 'learning_rate': 4.562554680664917e-05, 'epoch': 1.31}


 13%|█▎        | 1524/11430 [01:58<12:16, 13.45it/s]
 13%|█▎        | 1524/11430 [02:02<12:16, 13.45it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08623529970645905, 'eval_f1': 0.7916186891702456, 'eval_precision': 0.8787048895456192, 'eval_recall': 0.7405206025208717, 'eval_runtime': 4.0488, 'eval_samples_per_second': 376.163, 'eval_steps_per_second': 47.175, 'epoch': 2.0}


 18%|█▊        | 2001/11430 [02:40<11:56, 13.16it/s]  

{'loss': 0.0745, 'grad_norm': 0.04900272935628891, 'learning_rate': 4.125109361329834e-05, 'epoch': 2.62}


 20%|█▉        | 2285/11430 [03:01<11:23, 13.38it/s]
 20%|██        | 2286/11430 [03:05<11:23, 13.38it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09296096861362457, 'eval_f1': 0.8313916241873213, 'eval_precision': 0.8382751483825316, 'eval_recall': 0.8269447613158555, 'eval_runtime': 4.0131, 'eval_samples_per_second': 379.505, 'eval_steps_per_second': 47.594, 'epoch': 3.0}


 26%|██▋       | 3002/11430 [04:02<10:40, 13.16it/s]  

{'loss': 0.0492, 'grad_norm': 1.1734230518341064, 'learning_rate': 3.6876640419947505e-05, 'epoch': 3.94}


 27%|██▋       | 3048/11430 [04:05<10:22, 13.46it/s]
 27%|██▋       | 3048/11430 [04:09<10:22, 13.46it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09235883504152298, 'eval_f1': 0.8438790225329642, 'eval_precision': 0.8767866992313017, 'eval_recall': 0.8198283319622474, 'eval_runtime': 4.1213, 'eval_samples_per_second': 369.548, 'eval_steps_per_second': 46.345, 'epoch': 4.0}


 33%|███▎      | 3810/11430 [05:09<09:08, 13.90it/s]  
 33%|███▎      | 3810/11430 [05:13<09:08, 13.90it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09970040619373322, 'eval_f1': 0.8535481164766842, 'eval_precision': 0.86607452782873, 'eval_recall': 0.8425144669092134, 'eval_runtime': 3.9787, 'eval_samples_per_second': 382.792, 'eval_steps_per_second': 48.006, 'epoch': 5.0}


 35%|███▌      | 4002/11430 [05:30<09:24, 13.16it/s]  

{'loss': 0.0278, 'grad_norm': 0.29567044973373413, 'learning_rate': 3.2502187226596675e-05, 'epoch': 5.25}


 40%|████      | 4572/11430 [06:13<08:11, 13.95it/s]
 40%|████      | 4572/11430 [06:17<08:11, 13.95it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11564214527606964, 'eval_f1': 0.8150009362616304, 'eval_precision': 0.8569777084117522, 'eval_recall': 0.7956479749010948, 'eval_runtime': 4.1413, 'eval_samples_per_second': 367.763, 'eval_steps_per_second': 46.121, 'epoch': 6.0}


 44%|████▍     | 5002/11430 [06:52<08:08, 13.15it/s]  

{'loss': 0.017, 'grad_norm': 1.364526391029358, 'learning_rate': 2.8127734033245845e-05, 'epoch': 6.56}


 47%|████▋     | 5334/11430 [07:17<07:12, 14.09it/s]
 47%|████▋     | 5334/11430 [07:21<07:12, 14.09it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10965515673160553, 'eval_f1': 0.8618082486199156, 'eval_precision': 0.9069638722304213, 'eval_recall': 0.8404615721674353, 'eval_runtime': 3.9678, 'eval_samples_per_second': 383.839, 'eval_steps_per_second': 48.137, 'epoch': 7.0}


 53%|█████▎    | 6002/11430 [08:13<06:54, 13.10it/s]  

{'loss': 0.0114, 'grad_norm': 0.005783134140074253, 'learning_rate': 2.3753280839895015e-05, 'epoch': 7.87}


 53%|█████▎    | 6096/11430 [08:20<06:21, 13.97it/s]
 53%|█████▎    | 6096/11430 [08:24<06:21, 13.97it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1213071420788765, 'eval_f1': 0.8579397642767219, 'eval_precision': 0.8715286568500493, 'eval_recall': 0.8527936885820084, 'eval_runtime': 4.088, 'eval_samples_per_second': 372.55, 'eval_steps_per_second': 46.722, 'epoch': 8.0}


 60%|██████    | 6858/11430 [09:24<05:22, 14.16it/s]  
 60%|██████    | 6858/11430 [09:28<05:22, 14.16it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.130671888589859, 'eval_f1': 0.8578541906128679, 'eval_precision': 0.8825564177768488, 'eval_recall': 0.8385360749532048, 'eval_runtime': 3.9197, 'eval_samples_per_second': 388.549, 'eval_steps_per_second': 48.728, 'epoch': 9.0}


 61%|██████▏   | 7002/11430 [09:41<05:37, 13.13it/s]  

{'loss': 0.0072, 'grad_norm': 0.2394903600215912, 'learning_rate': 1.9378827646544184e-05, 'epoch': 9.19}


 67%|██████▋   | 7620/11430 [10:27<04:35, 13.82it/s]
 67%|██████▋   | 7620/11430 [10:31<04:35, 13.82it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12724924087524414, 'eval_f1': 0.8668294671511206, 'eval_precision': 0.8691352052828546, 'eval_recall': 0.8681339518579678, 'eval_runtime': 4.0008, 'eval_samples_per_second': 380.672, 'eval_steps_per_second': 47.74, 'epoch': 10.0}


 70%|███████   | 8001/11430 [11:02<04:31, 12.61it/s]  

{'loss': 0.0059, 'grad_norm': 0.05227774381637573, 'learning_rate': 1.500437445319335e-05, 'epoch': 10.5}


 73%|███████▎  | 8381/11430 [11:31<03:49, 13.29it/s]
 73%|███████▎  | 8382/11430 [11:35<03:49, 13.29it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12514810264110565, 'eval_f1': 0.8678764646845367, 'eval_precision': 0.8801444835202298, 'eval_recall': 0.8571586448228181, 'eval_runtime': 3.9361, 'eval_samples_per_second': 386.929, 'eval_steps_per_second': 48.525, 'epoch': 11.0}


 79%|███████▊  | 9001/11430 [12:23<03:04, 13.14it/s]

{'loss': 0.0047, 'grad_norm': 0.006345884874463081, 'learning_rate': 1.062992125984252e-05, 'epoch': 11.81}


 80%|███████▉  | 9143/11430 [12:34<02:49, 13.51it/s]
 80%|████████  | 9144/11430 [12:38<02:49, 13.51it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1326802670955658, 'eval_f1': 0.858914960516364, 'eval_precision': 0.8729174158431074, 'eval_recall': 0.8465985491234035, 'eval_runtime': 3.8908, 'eval_samples_per_second': 391.433, 'eval_steps_per_second': 49.09, 'epoch': 12.0}


 87%|████████▋ | 9906/11430 [13:37<01:56, 13.06it/s]
 87%|████████▋ | 9906/11430 [13:41<01:56, 13.06it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12786667048931122, 'eval_f1': 0.8572720748285331, 'eval_precision': 0.869384786289442, 'eval_recall': 0.8486247112557134, 'eval_runtime': 3.9988, 'eval_samples_per_second': 380.867, 'eval_steps_per_second': 47.765, 'epoch': 13.0}


 88%|████████▊ | 10002/11430 [13:51<01:48, 13.13it/s]

{'loss': 0.0027, 'grad_norm': 0.014841370284557343, 'learning_rate': 6.255468066491689e-06, 'epoch': 13.12}


 93%|█████████▎| 10668/11430 [14:41<00:56, 13.39it/s]
 93%|█████████▎| 10668/11430 [14:45<00:56, 13.39it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13190801441669464, 'eval_f1': 0.867773029429845, 'eval_precision': 0.8771097161052449, 'eval_recall': 0.8592709977937754, 'eval_runtime': 3.9297, 'eval_samples_per_second': 387.559, 'eval_steps_per_second': 48.604, 'epoch': 14.0}


 96%|█████████▋| 11002/11430 [15:12<00:32, 13.37it/s]

{'loss': 0.0016, 'grad_norm': 0.0032892210874706507, 'learning_rate': 1.8810148731408575e-06, 'epoch': 14.44}


100%|██████████| 11430/11430 [15:45<00:00, 13.33it/s]
100%|██████████| 11430/11430 [15:50<00:00, 13.33it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13385044038295746, 'eval_f1': 0.866303440340593, 'eval_precision': 0.8788358435525297, 'eval_recall': 0.8555293069186499, 'eval_runtime': 3.9648, 'eval_samples_per_second': 384.134, 'eval_steps_per_second': 48.174, 'epoch': 15.0}


100%|██████████| 11430/11430 [15:53<00:00, 11.99it/s]


{'train_runtime': 953.3229, 'train_samples_per_second': 95.838, 'train_steps_per_second': 11.99, 'train_loss': 0.02915453580212197, 'epoch': 15.0}


100%|██████████| 191/191 [00:03<00:00, 48.57it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.12514810264110565, 'eval_f1': 0.8678764646845367, 'eval_precision': 0.8801444835202298, 'eval_recall': 0.8571586448228181, 'eval_runtime': 3.9897, 'eval_samples_per_second': 381.734, 'eval_steps_per_second': 47.873, 'epoch': 15.0}


0,1
eval/f1,▁▆▇▇█▆██████████
eval/loss,▆▁▂▂▃▅▄▆█▇▇█▇██▇
eval/precision,▁▇▆▇▇▇█▇▇▇▇▇▇▇▇▇
eval/recall,▁▄▇▇▇▆▇█▇██▇▇███
eval/runtime,▁▆▅█▄█▄▇▃▅▃▂▅▃▄▅
eval/samples_per_second,█▃▄▁▄▁▅▂▆▄▅▆▄▆▅▄
eval/steps_per_second,█▃▄▁▄▁▅▂▆▄▅▆▄▆▅▄
train/epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇████
train/grad_norm,█▁▄▂▄▁▂▁▁▁▁

0,1
eval/f1,0.86788
eval/loss,0.12515
eval/precision,0.88014
eval/recall,0.85716
eval/runtime,3.9897
eval/samples_per_second,381.734
eval/steps_per_second,47.873
total_flos,6010055190432000.0
train/epoch,15.0
train/global_step,11430.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 11775.10 examples/s]
  7%|▋         | 189/2835 [00:14<03:03, 14.38it/s]
  7%|▋         | 189/2835 [00:15<03:03, 14.38it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.35030344128608704, 'eval_f1': 0.3438159419107449, 'eval_precision': 0.5732959850606909, 'eval_recall': 0.2675579401918132, 'eval_runtime': 0.9457, 'eval_samples_per_second': 398.658, 'eval_steps_per_second': 50.758, 'epoch': 1.0}


 13%|█▎        | 377/2835 [00:31<02:59, 13.68it/s]
 13%|█▎        | 378/2835 [00:32<02:59, 13.68it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.29894378781318665, 'eval_f1': 0.5157309289004106, 'eval_precision': 0.6518345100953796, 'eval_recall': 0.4439548111390117, 'eval_runtime': 0.9646, 'eval_samples_per_second': 390.826, 'eval_steps_per_second': 49.76, 'epoch': 2.0}


 20%|█▉        | 566/2835 [00:47<02:50, 13.32it/s]
 20%|██        | 567/2835 [00:48<02:50, 13.32it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.32926657795906067, 'eval_f1': 0.5576744123085586, 'eval_precision': 0.6266709151279162, 'eval_recall': 0.537663568225591, 'eval_runtime': 0.9915, 'eval_samples_per_second': 380.24, 'eval_steps_per_second': 48.413, 'epoch': 3.0}


 27%|██▋       | 756/2835 [01:04<02:25, 14.32it/s]
 27%|██▋       | 756/2835 [01:05<02:25, 14.32it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3317989706993103, 'eval_f1': 0.6797385485919337, 'eval_precision': 0.7746695520227569, 'eval_recall': 0.6221515525371988, 'eval_runtime': 0.9565, 'eval_samples_per_second': 394.149, 'eval_steps_per_second': 50.183, 'epoch': 4.0}


 33%|███▎      | 944/2835 [01:21<02:18, 13.61it/s]
 33%|███▎      | 945/2835 [01:22<02:18, 13.61it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3333657383918762, 'eval_f1': 0.7107777445155354, 'eval_precision': 0.7603139504056935, 'eval_recall': 0.674128019362737, 'eval_runtime': 0.9568, 'eval_samples_per_second': 394.01, 'eval_steps_per_second': 50.166, 'epoch': 5.0}


 35%|███▌      | 1002/2835 [01:28<02:16, 13.46it/s]

{'loss': 0.2486, 'grad_norm': 6.378255367279053, 'learning_rate': 3.2363315696649034e-05, 'epoch': 5.29}


 40%|████      | 1134/2835 [01:38<02:03, 13.77it/s]
 40%|████      | 1134/2835 [01:39<02:03, 13.77it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.36463794112205505, 'eval_f1': 0.7220249021272283, 'eval_precision': 0.7610555480773544, 'eval_recall': 0.7079361854508044, 'eval_runtime': 1.0256, 'eval_samples_per_second': 367.586, 'eval_steps_per_second': 46.801, 'epoch': 6.0}


 47%|████▋     | 1322/2835 [01:55<01:50, 13.69it/s]
 47%|████▋     | 1323/2835 [01:56<01:50, 13.69it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.38128677010536194, 'eval_f1': 0.7261184844801043, 'eval_precision': 0.7553513540892821, 'eval_recall': 0.723849815380353, 'eval_runtime': 0.9899, 'eval_samples_per_second': 380.862, 'eval_steps_per_second': 48.492, 'epoch': 7.0}


 53%|█████▎    | 1511/2835 [02:12<01:37, 13.60it/s]
 53%|█████▎    | 1512/2835 [02:13<01:37, 13.60it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.36190953850746155, 'eval_f1': 0.7409504306931771, 'eval_precision': 0.7660635370480467, 'eval_recall': 0.724728802610388, 'eval_runtime': 0.9678, 'eval_samples_per_second': 389.532, 'eval_steps_per_second': 49.596, 'epoch': 8.0}


 60%|██████    | 1701/2835 [02:28<01:24, 13.45it/s]
 60%|██████    | 1701/2835 [02:30<01:24, 13.45it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.40595486760139465, 'eval_f1': 0.7350937500575747, 'eval_precision': 0.7372076877361211, 'eval_recall': 0.744985034592891, 'eval_runtime': 1.0718, 'eval_samples_per_second': 351.735, 'eval_steps_per_second': 44.783, 'epoch': 9.0}


 67%|██████▋   | 1889/2835 [02:45<01:09, 13.64it/s]
 67%|██████▋   | 1890/2835 [02:46<01:09, 13.64it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4184468686580658, 'eval_f1': 0.7277445517766029, 'eval_precision': 0.7290696912182815, 'eval_recall': 0.733162852858153, 'eval_runtime': 0.9665, 'eval_samples_per_second': 390.068, 'eval_steps_per_second': 49.664, 'epoch': 10.0}


 71%|███████   | 2002/2835 [02:57<01:02, 13.28it/s]

{'loss': 0.0423, 'grad_norm': 9.449909210205078, 'learning_rate': 1.472663139329806e-05, 'epoch': 10.58}


 73%|███████▎  | 2078/2835 [03:02<00:55, 13.63it/s]
 73%|███████▎  | 2079/2835 [03:03<00:55, 13.63it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4062790274620056, 'eval_f1': 0.7480566963192846, 'eval_precision': 0.7534289901060023, 'eval_recall': 0.7527371875512001, 'eval_runtime': 0.9655, 'eval_samples_per_second': 390.466, 'eval_steps_per_second': 49.715, 'epoch': 11.0}


 80%|████████  | 2268/2835 [03:19<00:42, 13.33it/s]
 80%|████████  | 2268/2835 [03:20<00:42, 13.33it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4071006774902344, 'eval_f1': 0.7574184296094323, 'eval_precision': 0.7558308558595118, 'eval_recall': 0.7656493998718807, 'eval_runtime': 1.0039, 'eval_samples_per_second': 375.52, 'eval_steps_per_second': 47.812, 'epoch': 12.0}


 87%|████████▋ | 2456/2835 [03:36<00:27, 13.56it/s]
 87%|████████▋ | 2457/2835 [03:37<00:27, 13.56it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.41674861311912537, 'eval_f1': 0.7630689980679074, 'eval_precision': 0.7628386460565239, 'eval_recall': 0.7708024333040576, 'eval_runtime': 0.9746, 'eval_samples_per_second': 386.825, 'eval_steps_per_second': 49.251, 'epoch': 13.0}


 93%|█████████▎| 2646/2835 [03:53<00:13, 14.10it/s]
 93%|█████████▎| 2646/2835 [03:54<00:13, 14.10it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4142097532749176, 'eval_f1': 0.7552708655605375, 'eval_precision': 0.7475639563579853, 'eval_recall': 0.7690966048408249, 'eval_runtime': 0.9851, 'eval_samples_per_second': 382.707, 'eval_steps_per_second': 48.727, 'epoch': 14.0}


100%|█████████▉| 2834/2835 [04:10<00:00, 12.87it/s]
100%|██████████| 2835/2835 [04:13<00:00, 12.87it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4149714708328247, 'eval_f1': 0.7551023463283353, 'eval_precision': 0.7538590935774035, 'eval_recall': 0.7629182719657299, 'eval_runtime': 1.0577, 'eval_samples_per_second': 356.446, 'eval_steps_per_second': 45.383, 'epoch': 15.0}


100%|██████████| 2835/2835 [04:15<00:00, 11.10it/s]


{'train_runtime': 255.3547, 'train_samples_per_second': 88.524, 'train_steps_per_second': 11.102, 'train_loss': 0.10482793014095784, 'epoch': 15.0}


100%|██████████| 48/48 [00:01<00:00, 45.60it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.41674861311912537, 'eval_f1': 0.7630689980679074, 'eval_precision': 0.7628386460565239, 'eval_recall': 0.7708024333040576, 'eval_runtime': 1.1087, 'eval_samples_per_second': 340.046, 'eval_steps_per_second': 43.295, 'epoch': 15.0}


0,1
eval/f1,▁▄▅▇▇▇▇██▇██████
eval/loss,▄▁▃▃▃▅▆▅▇█▇▇████
eval/precision,▁▄▃███▇█▇▆▇▇█▇▇█
eval/recall,▁▃▅▆▇▇▇▇█▇██████
eval/runtime,▁▂▃▁▁▄▃▂▆▂▂▃▂▃▆█
eval/samples_per_second,█▇▆▇▇▄▆▇▂▇▇▅▇▆▃▁
eval/steps_per_second,█▇▆▇▇▄▆▇▂▇▇▅▇▆▃▁
train/epoch,▁▁▂▃▃▃▃▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▃▃▃▃▄▅▅▅▆▆▇▇▇███
train/grad_norm,▁█

0,1
eval/f1,0.76307
eval/loss,0.41675
eval/precision,0.76284
eval/recall,0.7708
eval/runtime,1.1087
eval/samples_per_second,340.046
eval/steps_per_second,43.295
total_flos,1486946402599680.0
train/epoch,15.0
train/global_step,2835.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 9605.27 examples/s]
  7%|▋         | 129/1950 [00:09<02:12, 13.79it/s]
  7%|▋         | 130/1950 [00:10<02:12, 13.79it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3036459684371948, 'eval_f1': 0.16790838182012455, 'eval_precision': 0.19576719576719576, 'eval_recall': 0.1502970297029703, 'eval_runtime': 0.6548, 'eval_samples_per_second': 397.053, 'eval_steps_per_second': 50.395, 'epoch': 1.0}


 13%|█▎        | 259/1950 [00:21<02:03, 13.68it/s]
 13%|█▎        | 260/1950 [00:22<02:03, 13.68it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2558704912662506, 'eval_f1': 0.42431106121034823, 'eval_precision': 0.5862403317491381, 'eval_recall': 0.3499028364796852, 'eval_runtime': 0.6765, 'eval_samples_per_second': 384.351, 'eval_steps_per_second': 48.783, 'epoch': 2.0}


 20%|██        | 390/1950 [00:33<01:51, 13.98it/s]
 20%|██        | 390/1950 [00:34<01:51, 13.98it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.236893430352211, 'eval_f1': 0.5144439995781548, 'eval_precision': 0.5890427599611273, 'eval_recall': 0.4691778137751614, 'eval_runtime': 0.656, 'eval_samples_per_second': 396.328, 'eval_steps_per_second': 50.303, 'epoch': 3.0}


 27%|██▋       | 519/1950 [00:46<01:46, 13.37it/s]
 27%|██▋       | 520/1950 [00:47<01:46, 13.37it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22038377821445465, 'eval_f1': 0.566415429808287, 'eval_precision': 0.7385803106667768, 'eval_recall': 0.5215861405598018, 'eval_runtime': 0.7558, 'eval_samples_per_second': 343.986, 'eval_steps_per_second': 43.66, 'epoch': 4.0}


 33%|███▎      | 650/1950 [00:59<01:36, 13.50it/s]
 33%|███▎      | 650/1950 [01:00<01:36, 13.50it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2480241060256958, 'eval_f1': 0.6110421898294869, 'eval_precision': 0.8258051997071959, 'eval_recall': 0.5401676525720995, 'eval_runtime': 0.6802, 'eval_samples_per_second': 382.258, 'eval_steps_per_second': 48.517, 'epoch': 5.0}


 40%|████      | 780/1950 [01:11<01:27, 13.35it/s]
 40%|████      | 780/1950 [01:12<01:27, 13.35it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.22338488698005676, 'eval_f1': 0.5970871609760678, 'eval_precision': 0.7968823178312887, 'eval_recall': 0.5602758476135568, 'eval_runtime': 0.7093, 'eval_samples_per_second': 366.579, 'eval_steps_per_second': 46.527, 'epoch': 6.0}


 47%|████▋     | 909/1950 [01:23<01:18, 13.23it/s]
 47%|████▋     | 910/1950 [01:24<01:18, 13.23it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2191298007965088, 'eval_f1': 0.6686150537448535, 'eval_precision': 0.8583780560524746, 'eval_recall': 0.5939757742485859, 'eval_runtime': 0.6553, 'eval_samples_per_second': 396.768, 'eval_steps_per_second': 50.359, 'epoch': 7.0}


 51%|█████▏    | 1002/1950 [01:33<01:13, 12.89it/s]

{'loss': 0.1615, 'grad_norm': 0.5588142275810242, 'learning_rate': 2.435897435897436e-05, 'epoch': 7.69}


 53%|█████▎    | 1040/1950 [01:36<01:08, 13.34it/s]
 53%|█████▎    | 1040/1950 [01:36<01:08, 13.34it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23590117692947388, 'eval_f1': 0.6483335512041856, 'eval_precision': 0.8257794144641927, 'eval_recall': 0.5969895675720646, 'eval_runtime': 0.6873, 'eval_samples_per_second': 378.303, 'eval_steps_per_second': 48.015, 'epoch': 8.0}


 60%|██████    | 1170/1950 [01:48<00:58, 13.44it/s]
 60%|██████    | 1170/1950 [01:48<00:58, 13.44it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23401305079460144, 'eval_f1': 0.6711011575148292, 'eval_precision': 0.7769060802314041, 'eval_recall': 0.6141749321994384, 'eval_runtime': 0.6618, 'eval_samples_per_second': 392.843, 'eval_steps_per_second': 49.861, 'epoch': 9.0}


 67%|██████▋   | 1299/1950 [02:00<00:49, 13.20it/s]
 67%|██████▋   | 1300/1950 [02:01<00:49, 13.20it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24790766835212708, 'eval_f1': 0.6740172536991483, 'eval_precision': 0.7809820818279464, 'eval_recall': 0.6290379504390093, 'eval_runtime': 0.7233, 'eval_samples_per_second': 359.459, 'eval_steps_per_second': 45.624, 'epoch': 10.0}


 73%|███████▎  | 1429/1950 [02:12<00:39, 13.23it/s]
 73%|███████▎  | 1430/1950 [02:12<00:39, 13.23it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24342216551303864, 'eval_f1': 0.7096027750900246, 'eval_precision': 0.8043821640315879, 'eval_recall': 0.654232624053136, 'eval_runtime': 0.6817, 'eval_samples_per_second': 381.421, 'eval_steps_per_second': 48.411, 'epoch': 11.0}


 80%|███████▉  | 1559/1950 [02:24<00:29, 13.20it/s]
 80%|████████  | 1560/1950 [02:25<00:29, 13.20it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24256956577301025, 'eval_f1': 0.7008562024954753, 'eval_precision': 0.7590954406841107, 'eval_recall': 0.659735387552513, 'eval_runtime': 0.7231, 'eval_samples_per_second': 359.554, 'eval_steps_per_second': 45.636, 'epoch': 12.0}


 87%|████████▋ | 1689/1950 [02:36<00:19, 13.47it/s]
 87%|████████▋ | 1690/1950 [02:37<00:19, 13.47it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.24454239010810852, 'eval_f1': 0.6951668821620638, 'eval_precision': 0.7773649929179882, 'eval_recall': 0.6445651935671434, 'eval_runtime': 0.7099, 'eval_samples_per_second': 366.239, 'eval_steps_per_second': 46.484, 'epoch': 13.0}


 93%|█████████▎| 1820/1950 [02:48<00:09, 13.39it/s]
 93%|█████████▎| 1820/1950 [02:49<00:09, 13.39it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.250672847032547, 'eval_f1': 0.7119987574754617, 'eval_precision': 0.7942971200700973, 'eval_recall': 0.6649657902920352, 'eval_runtime': 0.7076, 'eval_samples_per_second': 367.433, 'eval_steps_per_second': 46.636, 'epoch': 14.0}


100%|██████████| 1950/1950 [03:00<00:00, 13.79it/s]
100%|██████████| 1950/1950 [03:03<00:00, 13.79it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2492733597755432, 'eval_f1': 0.7188116022467084, 'eval_precision': 0.8059085135576177, 'eval_recall': 0.6697815779932543, 'eval_runtime': 0.6847, 'eval_samples_per_second': 379.745, 'eval_steps_per_second': 48.198, 'epoch': 15.0}


100%|██████████| 1950/1950 [03:05<00:00, 10.53it/s]


{'train_runtime': 185.1698, 'train_samples_per_second': 84.085, 'train_steps_per_second': 10.531, 'train_loss': 0.0941875721858098, 'epoch': 15.0}


100%|██████████| 33/33 [00:00<00:00, 43.71it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.2492733597755432, 'eval_f1': 0.7188116022467084, 'eval_precision': 0.8059085135576177, 'eval_recall': 0.6697815779932543, 'eval_runtime': 0.8099, 'eval_samples_per_second': 321.046, 'eval_steps_per_second': 40.748, 'epoch': 15.0}
    comparison                                     model     lan  \
0   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
1   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
2   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
3   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
4   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
5   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
6   validation  epoch-15_batchSize-8_weightsOfDecay-0.01    java   
7   validation  epoch-15_batchSize-8_weightsOfDecay-0.01  python   
8   validation  epoch-15_batchSize-8_weightsOfDecay-0.01  python   
9   v

0,1
eval/f1,▁▄▅▆▇▆▇▇▇▇██████
eval/loss,█▄▂▁▃▁▁▂▂▃▃▃▃▄▃▃
eval/precision,▁▅▅▇█▇██▇▇▇▇▇▇▇▇
eval/recall,▁▄▅▆▆▇▇▇▇▇██████
eval/runtime,▁▂▁▆▂▃▁▂▁▄▂▄▃▃▂█
eval/samples_per_second,█▇█▃▇▅█▆█▅▇▅▅▅▆▁
eval/steps_per_second,█▇█▃▇▅█▆█▅▇▅▅▅▆▁
train/epoch,▁▁▂▃▃▃▄▄▅▅▅▆▇▇▇███
train/global_step,▁▁▂▃▃▃▄▄▅▅▅▆▇▇▇███
train/grad_norm,▁

0,1
eval/f1,0.71881
eval/loss,0.24927
eval/precision,0.80591
eval/recall,0.66978
eval/runtime,0.8099
eval/samples_per_second,321.046
eval/steps_per_second,40.748
total_flos,1024205760576000.0
train/epoch,15.0
train/global_step,1950.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 16272.57 examples/s]
  3%|▎         | 1002/30460 [01:03<30:21, 16.17it/s]

{'loss': 0.1586, 'grad_norm': 0.15519636869430542, 'learning_rate': 4.8358502954694684e-05, 'epoch': 0.66}


  5%|▍         | 1522/30460 [01:36<30:40, 15.72it/s]
  5%|▌         | 1523/30460 [01:43<30:40, 15.72it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11901015788316727, 'eval_f1': 0.525958749987395, 'eval_precision': 0.5293984414398859, 'eval_recall': 0.523017530209423, 'eval_runtime': 6.9585, 'eval_samples_per_second': 218.87, 'eval_steps_per_second': 54.754, 'epoch': 1.0}


  7%|▋         | 2002/30460 [02:15<31:50, 14.89it/s]   

{'loss': 0.1095, 'grad_norm': 8.95475959777832, 'learning_rate': 4.6717005909389365e-05, 'epoch': 1.31}


 10%|▉         | 3002/30460 [03:18<28:28, 16.08it/s]

{'loss': 0.0976, 'grad_norm': 0.6023632287979126, 'learning_rate': 4.507550886408405e-05, 'epoch': 1.97}


 10%|█         | 3046/30460 [03:21<29:48, 15.33it/s]
 10%|█         | 3046/30460 [03:27<29:48, 15.33it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09333226829767227, 'eval_f1': 0.7774207742257129, 'eval_precision': 0.894692668940965, 'eval_recall': 0.7311580699789071, 'eval_runtime': 6.3451, 'eval_samples_per_second': 240.029, 'eval_steps_per_second': 60.047, 'epoch': 2.0}


 13%|█▎        | 4002/30460 [04:29<27:14, 16.19it/s]  

{'loss': 0.0753, 'grad_norm': 0.045328378677368164, 'learning_rate': 4.343401181877873e-05, 'epoch': 2.63}


 15%|█▍        | 4568/30460 [05:05<26:43, 16.14it/s]
 15%|█▌        | 4569/30460 [05:11<26:43, 16.14it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.0923575833439827, 'eval_f1': 0.8162786367080856, 'eval_precision': 0.838373295256532, 'eval_recall': 0.7981771052440053, 'eval_runtime': 5.8113, 'eval_samples_per_second': 262.076, 'eval_steps_per_second': 65.562, 'epoch': 3.0}


 16%|█▋        | 5002/30460 [05:40<25:58, 16.33it/s]  

{'loss': 0.0657, 'grad_norm': 16.533084869384766, 'learning_rate': 4.179251477347341e-05, 'epoch': 3.28}


 20%|█▉        | 6002/30460 [06:43<27:09, 15.01it/s]

{'loss': 0.0568, 'grad_norm': 4.497607231140137, 'learning_rate': 4.015101772816809e-05, 'epoch': 3.94}


 20%|██        | 6092/30460 [06:49<24:38, 16.48it/s]
 20%|██        | 6092/30460 [06:55<24:38, 16.48it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10009157657623291, 'eval_f1': 0.8400938519830727, 'eval_precision': 0.8708343279599219, 'eval_recall': 0.8142905419120252, 'eval_runtime': 6.0025, 'eval_samples_per_second': 253.726, 'eval_steps_per_second': 63.473, 'epoch': 4.0}


 23%|██▎       | 7002/30460 [07:54<25:16, 15.47it/s]  

{'loss': 0.0413, 'grad_norm': 0.007817510515451431, 'learning_rate': 3.850952068286277e-05, 'epoch': 4.6}


 25%|██▍       | 7614/30460 [08:32<26:29, 14.37it/s]
 25%|██▌       | 7615/30460 [08:39<26:29, 14.37it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11629816889762878, 'eval_f1': 0.8470527750452176, 'eval_precision': 0.8577189622328284, 'eval_recall': 0.8380720728585276, 'eval_runtime': 6.8246, 'eval_samples_per_second': 223.162, 'eval_steps_per_second': 55.827, 'epoch': 5.0}


 26%|██▋       | 8002/30460 [09:06<23:12, 16.13it/s]  

{'loss': 0.0399, 'grad_norm': 1.609573245048523, 'learning_rate': 3.6868023637557454e-05, 'epoch': 5.25}


 30%|██▉       | 9002/30460 [10:09<22:47, 15.69it/s]

{'loss': 0.0321, 'grad_norm': 2.9567744731903076, 'learning_rate': 3.5226526592252135e-05, 'epoch': 5.91}


 30%|███       | 9138/30460 [10:17<21:05, 16.85it/s]
 30%|███       | 9138/30460 [10:24<21:05, 16.85it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13707908987998962, 'eval_f1': 0.8093048395665736, 'eval_precision': 0.8525837833797454, 'eval_recall': 0.7763155962721372, 'eval_runtime': 6.53, 'eval_samples_per_second': 233.229, 'eval_steps_per_second': 58.346, 'epoch': 6.0}


 33%|███▎      | 10002/30460 [11:19<22:54, 14.88it/s] 

{'loss': 0.0277, 'grad_norm': 0.02411346323788166, 'learning_rate': 3.3585029546946817e-05, 'epoch': 6.57}


 35%|███▍      | 10660/30460 [12:01<21:08, 15.61it/s]
 35%|███▌      | 10661/30460 [12:07<21:08, 15.61it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12553171813488007, 'eval_f1': 0.8438809523696537, 'eval_precision': 0.8424995365230378, 'eval_recall': 0.8459957417729085, 'eval_runtime': 5.7093, 'eval_samples_per_second': 266.757, 'eval_steps_per_second': 66.733, 'epoch': 7.0}


 36%|███▌      | 11002/30460 [12:30<20:06, 16.13it/s]  

{'loss': 0.0268, 'grad_norm': 0.00794950220733881, 'learning_rate': 3.19435325016415e-05, 'epoch': 7.22}


 39%|███▉      | 12002/30460 [13:32<18:34, 16.57it/s]

{'loss': 0.0205, 'grad_norm': 0.00551955122500658, 'learning_rate': 3.030203545633618e-05, 'epoch': 7.88}


 40%|████      | 12184/30460 [13:44<18:30, 16.46it/s]
 40%|████      | 12184/30460 [13:50<18:30, 16.46it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13599897921085358, 'eval_f1': 0.8391863303557712, 'eval_precision': 0.8239611463734394, 'eval_recall': 0.8572893044904274, 'eval_runtime': 6.2228, 'eval_samples_per_second': 244.745, 'eval_steps_per_second': 61.227, 'epoch': 8.0}


 43%|████▎     | 13002/30460 [14:43<19:32, 14.88it/s]  

{'loss': 0.0153, 'grad_norm': 0.011293191462755203, 'learning_rate': 2.8660538411030864e-05, 'epoch': 8.54}


 45%|████▍     | 13706/30460 [15:27<17:48, 15.68it/s]
 45%|████▌     | 13707/30460 [15:34<17:48, 15.68it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.15169255435466766, 'eval_f1': 0.8364481297581653, 'eval_precision': 0.8533720125543205, 'eval_recall': 0.8231716864699578, 'eval_runtime': 6.675, 'eval_samples_per_second': 228.165, 'eval_steps_per_second': 57.079, 'epoch': 9.0}


 46%|████▌     | 14002/30460 [15:54<18:17, 14.99it/s]  

{'loss': 0.017, 'grad_norm': 0.03893284127116203, 'learning_rate': 2.7019041365725546e-05, 'epoch': 9.19}


 49%|████▉     | 15002/30460 [16:57<16:17, 15.81it/s]

{'loss': 0.0168, 'grad_norm': 0.012770706787705421, 'learning_rate': 2.5377544320420227e-05, 'epoch': 9.85}


 50%|█████     | 15230/30460 [17:11<15:16, 16.61it/s]
 50%|█████     | 15230/30460 [17:18<15:16, 16.61it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.15066884458065033, 'eval_f1': 0.8425547303161791, 'eval_precision': 0.8493971091655252, 'eval_recall': 0.8375153614197516, 'eval_runtime': 6.7372, 'eval_samples_per_second': 226.06, 'eval_steps_per_second': 56.552, 'epoch': 10.0}


 53%|█████▎    | 16002/30460 [18:09<14:39, 16.44it/s]  

{'loss': 0.0123, 'grad_norm': 0.007522749248892069, 'learning_rate': 2.3736047275114905e-05, 'epoch': 10.51}


 55%|█████▍    | 16752/30460 [18:56<13:59, 16.32it/s]
 55%|█████▌    | 16753/30460 [19:02<13:59, 16.32it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12663614749908447, 'eval_f1': 0.864316729816119, 'eval_precision': 0.8594225639951215, 'eval_recall': 0.8703994395223748, 'eval_runtime': 5.684, 'eval_samples_per_second': 267.944, 'eval_steps_per_second': 67.03, 'epoch': 11.0}


 56%|█████▌    | 17002/30460 [19:19<14:04, 15.93it/s]  

{'loss': 0.0101, 'grad_norm': 0.003383345203474164, 'learning_rate': 2.2094550229809586e-05, 'epoch': 11.16}


 59%|█████▉    | 18002/30460 [20:23<13:08, 15.80it/s]

{'loss': 0.0114, 'grad_norm': 0.005847027525305748, 'learning_rate': 2.045305318450427e-05, 'epoch': 11.82}


 60%|██████    | 18276/30460 [20:40<12:38, 16.07it/s]
 60%|██████    | 18276/30460 [20:47<12:38, 16.07it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.15161816775798798, 'eval_f1': 0.8439876784420038, 'eval_precision': 0.8567704156184851, 'eval_recall': 0.8348560743557186, 'eval_runtime': 6.375, 'eval_samples_per_second': 238.904, 'eval_steps_per_second': 59.765, 'epoch': 12.0}


 62%|██████▏   | 19002/30460 [21:34<11:42, 16.31it/s]  

{'loss': 0.0048, 'grad_norm': 0.010884199291467667, 'learning_rate': 1.881155613919895e-05, 'epoch': 12.48}


 65%|██████▍   | 19798/30460 [22:23<12:02, 14.76it/s]
 65%|██████▌   | 19799/30460 [22:30<12:02, 14.76it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.16219905018806458, 'eval_f1': 0.8371787563941178, 'eval_precision': 0.8424486664184876, 'eval_recall': 0.8366784958635429, 'eval_runtime': 6.8681, 'eval_samples_per_second': 221.749, 'eval_steps_per_second': 55.474, 'epoch': 13.0}


 66%|██████▌   | 20002/30460 [22:45<11:32, 15.10it/s]  

{'loss': 0.0067, 'grad_norm': 0.009802866727113724, 'learning_rate': 1.717005909389363e-05, 'epoch': 13.13}


 69%|██████▉   | 21002/30460 [23:47<11:02, 14.28it/s]

{'loss': 0.0061, 'grad_norm': 1.343064546585083, 'learning_rate': 1.5528562048588312e-05, 'epoch': 13.79}


 70%|███████   | 21322/30460 [24:08<08:59, 16.93it/s]
 70%|███████   | 21322/30460 [24:14<08:59, 16.93it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.14505991339683533, 'eval_f1': 0.8608233486272413, 'eval_precision': 0.8634930506793953, 'eval_recall': 0.8588050628336765, 'eval_runtime': 6.2489, 'eval_samples_per_second': 243.723, 'eval_steps_per_second': 60.971, 'epoch': 14.0}


 72%|███████▏  | 22002/30460 [24:59<08:27, 16.67it/s]  

{'loss': 0.0057, 'grad_norm': 0.07342442870140076, 'learning_rate': 1.3887065003282995e-05, 'epoch': 14.45}


 75%|███████▍  | 22844/30460 [25:52<08:18, 15.29it/s]
 75%|███████▌  | 22845/30460 [25:57<08:17, 15.29it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.14825697243213654, 'eval_f1': 0.8703402636121986, 'eval_precision': 0.8851981476088142, 'eval_recall': 0.8600447280292675, 'eval_runtime': 5.5989, 'eval_samples_per_second': 272.02, 'eval_steps_per_second': 68.05, 'epoch': 15.0}


 76%|███████▌  | 23002/30460 [26:09<07:58, 15.57it/s]  

{'loss': 0.005, 'grad_norm': 0.007914071902632713, 'learning_rate': 1.2245567957977677e-05, 'epoch': 15.1}


 79%|███████▉  | 24002/30460 [27:12<06:48, 15.83it/s]

{'loss': 0.0025, 'grad_norm': 0.003541424171999097, 'learning_rate': 1.0604070912672358e-05, 'epoch': 15.76}


 80%|████████  | 24368/30460 [27:36<06:13, 16.31it/s]
 80%|████████  | 24368/30460 [27:42<06:13, 16.31it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.16622476279735565, 'eval_f1': 0.8501053650917363, 'eval_precision': 0.8595388781670705, 'eval_recall': 0.8427954042322444, 'eval_runtime': 6.1517, 'eval_samples_per_second': 247.572, 'eval_steps_per_second': 61.934, 'epoch': 16.0}


 82%|████████▏ | 25002/30460 [28:23<06:20, 14.33it/s]  

{'loss': 0.0034, 'grad_norm': 0.0004726785409729928, 'learning_rate': 8.96257386736704e-06, 'epoch': 16.41}


 85%|████████▍ | 25890/30460 [29:20<05:04, 15.01it/s]
 85%|████████▌ | 25891/30460 [29:27<05:04, 15.01it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.15883567929267883, 'eval_f1': 0.8590408314619568, 'eval_precision': 0.859063909384827, 'eval_recall': 0.8596053678375852, 'eval_runtime': 7.3415, 'eval_samples_per_second': 207.452, 'eval_steps_per_second': 51.897, 'epoch': 17.0}


 85%|████████▌ | 26002/30460 [29:36<04:32, 16.35it/s]  

{'loss': 0.0027, 'grad_norm': 0.006902558729052544, 'learning_rate': 7.321076822061721e-06, 'epoch': 17.07}


 89%|████████▊ | 27002/30460 [30:38<03:34, 16.13it/s]

{'loss': 0.0024, 'grad_norm': 0.00492890365421772, 'learning_rate': 5.679579776756402e-06, 'epoch': 17.73}


 90%|█████████ | 27414/30460 [31:04<03:12, 15.81it/s]
 90%|█████████ | 27414/30460 [31:11<03:12, 15.81it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1659005731344223, 'eval_f1': 0.8573574017788523, 'eval_precision': 0.867826691338208, 'eval_recall': 0.8498136679510832, 'eval_runtime': 6.2749, 'eval_samples_per_second': 242.713, 'eval_steps_per_second': 60.718, 'epoch': 18.0}


 92%|█████████▏| 28002/30460 [31:49<02:35, 15.85it/s]  

{'loss': 0.001, 'grad_norm': 0.0018304269760847092, 'learning_rate': 4.038082731451084e-06, 'epoch': 18.38}


 95%|█████████▍| 28936/30460 [32:48<01:29, 16.97it/s]
 95%|█████████▌| 28937/30460 [32:54<01:29, 16.97it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.16298989951610565, 'eval_f1': 0.856750623402685, 'eval_precision': 0.8563366878004111, 'eval_recall': 0.8580520042762606, 'eval_runtime': 5.9027, 'eval_samples_per_second': 258.019, 'eval_steps_per_second': 64.547, 'epoch': 19.0}


 95%|█████████▌| 29002/30460 [33:00<01:43, 14.08it/s]

{'loss': 0.0019, 'grad_norm': 0.002089886227622628, 'learning_rate': 2.396585686145765e-06, 'epoch': 19.04}


 98%|█████████▊| 30002/30460 [34:04<00:28, 15.86it/s]

{'loss': 0.0015, 'grad_norm': 0.0005213675904087722, 'learning_rate': 7.550886408404465e-07, 'epoch': 19.7}


100%|██████████| 30460/30460 [34:33<00:00, 16.51it/s]
100%|██████████| 30460/30460 [34:41<00:00, 16.51it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.16292807459831238, 'eval_f1': 0.8611469652413889, 'eval_precision': 0.8686743854364823, 'eval_recall': 0.8547743372195808, 'eval_runtime': 6.3745, 'eval_samples_per_second': 238.921, 'eval_steps_per_second': 59.77, 'epoch': 20.0}


100%|██████████| 30460/30460 [34:43<00:00, 14.62it/s]


{'train_runtime': 2083.9411, 'train_samples_per_second': 58.457, 'train_steps_per_second': 14.617, 'train_loss': 0.028843043367387588, 'epoch': 20.0}


100%|██████████| 381/381 [00:06<00:00, 62.94it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.14825697243213654, 'eval_f1': 0.8703402636121986, 'eval_precision': 0.8851981476088142, 'eval_recall': 0.8600447280292675, 'eval_runtime': 6.0886, 'eval_samples_per_second': 250.14, 'eval_steps_per_second': 62.576, 'epoch': 20.0}


0,1
eval/f1,▁▆▇▇█▇▇▇▇▇█▇▇████████
eval/loss,▄▁▁▂▃▅▄▅▇▇▄▇█▆▆█▇███▆
eval/precision,▁█▇█▇▇▇▇▇▇▇▇▇▇█▇▇▇▇██
eval/recall,▁▅▇▇▇▆██▇▇█▇▇██▇█████
eval/runtime,▆▄▂▃▆▅▁▄▅▆▁▄▆▄▁▃█▄▂▄▃
eval/samples_per_second,▂▅▇▆▃▄▇▅▃▃█▄▃▅█▅▁▅▆▄▆
eval/steps_per_second,▂▅▇▆▃▄▇▅▃▃█▄▃▅█▅▁▅▆▄▆
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▁▅▁▁█▃▁▂▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁

0,1
eval/f1,0.87034
eval/loss,0.14826
eval/precision,0.8852
eval/recall,0.86004
eval/runtime,6.0886
eval/samples_per_second,250.14
eval/steps_per_second,62.576
total_flos,8013406920576000.0
train/epoch,20.0
train/global_step,30460.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1884/1884 [00:00<00:00, 9491.52 examples/s] 
  5%|▌         | 377/7540 [00:23<07:47, 15.31it/s]
  5%|▌         | 377/7540 [00:25<07:47, 15.31it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.32231611013412476, 'eval_f1': 0.4463753167069841, 'eval_precision': 0.5321651902032308, 'eval_recall': 0.3927079458524042, 'eval_runtime': 1.8857, 'eval_samples_per_second': 199.93, 'eval_steps_per_second': 50.38, 'epoch': 1.0}


 10%|▉         | 753/7540 [00:50<07:15, 15.59it/s]  
 10%|█         | 754/7540 [00:52<07:15, 15.59it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.31431815028190613, 'eval_f1': 0.5478194081051224, 'eval_precision': 0.6510244115082825, 'eval_recall': 0.4914098686250671, 'eval_runtime': 1.9263, 'eval_samples_per_second': 195.715, 'eval_steps_per_second': 49.318, 'epoch': 2.0}


 13%|█▎        | 1003/7540 [01:10<06:53, 15.83it/s] 

{'loss': 0.3275, 'grad_norm': 4.9528656005859375, 'learning_rate': 4.3368700265251996e-05, 'epoch': 2.65}


 15%|█▌        | 1131/7540 [01:18<07:05, 15.07it/s]
 15%|█▌        | 1131/7540 [01:20<07:05, 15.07it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.3665944039821625, 'eval_f1': 0.6564880165285845, 'eval_precision': 0.7659047619047619, 'eval_recall': 0.6051491852850888, 'eval_runtime': 1.7758, 'eval_samples_per_second': 212.296, 'eval_steps_per_second': 53.496, 'epoch': 3.0}


 20%|█▉        | 1507/7540 [01:45<06:46, 14.83it/s]  
 20%|██        | 1508/7540 [01:47<06:46, 14.83it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.38242968916893005, 'eval_f1': 0.6841592088917304, 'eval_precision': 0.7352328641065495, 'eval_recall': 0.6447353527227911, 'eval_runtime': 1.9185, 'eval_samples_per_second': 196.505, 'eval_steps_per_second': 49.517, 'epoch': 4.0}


 25%|██▌       | 1885/7540 [02:13<06:13, 15.13it/s]  
 25%|██▌       | 1885/7540 [02:15<06:13, 15.13it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4002479612827301, 'eval_f1': 0.6854615894348205, 'eval_precision': 0.7164753692208168, 'eval_recall': 0.6589016502204817, 'eval_runtime': 1.9646, 'eval_samples_per_second': 191.897, 'eval_steps_per_second': 48.356, 'epoch': 5.0}


 27%|██▋       | 2003/7540 [02:24<06:05, 15.16it/s]

{'loss': 0.1288, 'grad_norm': 0.11996881663799286, 'learning_rate': 3.673740053050398e-05, 'epoch': 5.31}


 30%|██▉       | 2261/7540 [02:41<05:39, 15.56it/s]
 30%|███       | 2262/7540 [02:43<05:39, 15.56it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.4235920011997223, 'eval_f1': 0.729972644204251, 'eval_precision': 0.7682275672382312, 'eval_recall': 0.7272679058087039, 'eval_runtime': 1.9077, 'eval_samples_per_second': 197.62, 'eval_steps_per_second': 49.798, 'epoch': 6.0}


 35%|███▌      | 2639/7540 [03:08<05:43, 14.26it/s]
 35%|███▌      | 2639/7540 [03:10<05:43, 14.26it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.44108036160469055, 'eval_f1': 0.7414712256145148, 'eval_precision': 0.7580181468792233, 'eval_recall': 0.735972412764199, 'eval_runtime': 1.8161, 'eval_samples_per_second': 207.589, 'eval_steps_per_second': 52.31, 'epoch': 7.0}


 40%|███▉      | 3003/7540 [03:34<04:59, 15.12it/s]

{'loss': 0.0527, 'grad_norm': 2.209786891937256, 'learning_rate': 3.010610079575597e-05, 'epoch': 7.96}


 40%|███▉      | 3015/7540 [03:35<04:36, 16.34it/s]
 40%|████      | 3016/7540 [03:37<04:36, 16.34it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5086205005645752, 'eval_f1': 0.7280527141053033, 'eval_precision': 0.7498147066932646, 'eval_recall': 0.718554971838853, 'eval_runtime': 1.9335, 'eval_samples_per_second': 194.979, 'eval_steps_per_second': 49.133, 'epoch': 8.0}


 45%|████▌     | 3393/7540 [04:03<04:22, 15.81it/s]
 45%|████▌     | 3393/7540 [04:04<04:22, 15.81it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5082575082778931, 'eval_f1': 0.7256505005922247, 'eval_precision': 0.745104098769503, 'eval_recall': 0.7191404357459639, 'eval_runtime': 1.7847, 'eval_samples_per_second': 211.239, 'eval_steps_per_second': 53.23, 'epoch': 9.0}


 50%|████▉     | 3769/7540 [04:30<04:10, 15.05it/s]
 50%|█████     | 3770/7540 [04:32<04:10, 15.05it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.537062406539917, 'eval_f1': 0.7376594356626566, 'eval_precision': 0.7666442808840508, 'eval_recall': 0.7226485805511469, 'eval_runtime': 1.9916, 'eval_samples_per_second': 189.299, 'eval_steps_per_second': 47.701, 'epoch': 10.0}


 53%|█████▎    | 4003/7540 [04:48<03:49, 15.44it/s]

{'loss': 0.023, 'grad_norm': 0.12024925649166107, 'learning_rate': 2.347480106100796e-05, 'epoch': 10.61}


 55%|█████▌    | 4147/7540 [04:57<03:42, 15.24it/s]
 55%|█████▌    | 4147/7540 [04:59<03:42, 15.24it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5510263442993164, 'eval_f1': 0.7448222777034015, 'eval_precision': 0.7480301082181506, 'eval_recall': 0.7459773906855504, 'eval_runtime': 1.9017, 'eval_samples_per_second': 198.247, 'eval_steps_per_second': 49.956, 'epoch': 11.0}


 60%|██████    | 4524/7540 [05:26<02:59, 16.81it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5523668527603149, 'eval_f1': 0.7460632043674471, 'eval_precision': 0.7517827656976592, 'eval_recall': 0.7453286014074362, 'eval_runtime': 1.9667, 'eval_samples_per_second': 191.694, 'eval_steps_per_second': 48.305, 'epoch': 12.0}


 65%|██████▌   | 4901/7540 [05:52<02:51, 15.43it/s]
 65%|██████▌   | 4901/7540 [05:54<02:51, 15.43it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5318892002105713, 'eval_f1': 0.7581772309734733, 'eval_precision': 0.7708691243627722, 'eval_recall': 0.7483342275426308, 'eval_runtime': 2.2008, 'eval_samples_per_second': 171.299, 'eval_steps_per_second': 43.166, 'epoch': 13.0}


 66%|██████▋   | 5003/7540 [06:02<02:39, 15.86it/s]

{'loss': 0.006, 'grad_norm': 0.007030825596302748, 'learning_rate': 1.6843501326259946e-05, 'epoch': 13.26}


 70%|██████▉   | 5277/7540 [06:19<02:15, 16.71it/s]
 70%|███████   | 5278/7540 [06:21<02:15, 16.71it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5697777271270752, 'eval_f1': 0.7614326133781539, 'eval_precision': 0.7647028530229684, 'eval_recall': 0.762743994323939, 'eval_runtime': 1.9612, 'eval_samples_per_second': 192.23, 'eval_steps_per_second': 48.44, 'epoch': 14.0}


 75%|███████▌  | 5655/7540 [06:47<01:55, 16.36it/s]
 75%|███████▌  | 5655/7540 [06:49<01:55, 16.36it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.583881676197052, 'eval_f1': 0.7465231557664458, 'eval_precision': 0.7462415350792913, 'eval_recall': 0.760655949984284, 'eval_runtime': 1.9613, 'eval_samples_per_second': 192.224, 'eval_steps_per_second': 48.438, 'epoch': 15.0}


 80%|███████▉  | 6003/7540 [07:12<01:41, 15.15it/s]

{'loss': 0.0019, 'grad_norm': 0.017173318192362785, 'learning_rate': 1.0212201591511936e-05, 'epoch': 15.92}


 80%|███████▉  | 6031/7540 [07:14<01:38, 15.32it/s]
 80%|████████  | 6032/7540 [07:16<01:38, 15.32it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.5827566385269165, 'eval_f1': 0.76230708567207, 'eval_precision': 0.7688291301195332, 'eval_recall': 0.76340248278442, 'eval_runtime': 1.7788, 'eval_samples_per_second': 211.937, 'eval_steps_per_second': 53.406, 'epoch': 16.0}


 85%|████████▌ | 6409/7540 [07:41<01:07, 16.72it/s]
 85%|████████▌ | 6409/7540 [07:43<01:07, 16.72it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.6024753451347351, 'eval_f1': 0.7643092790955108, 'eval_precision': 0.763029528831164, 'eval_recall': 0.7715464652480735, 'eval_runtime': 1.5364, 'eval_samples_per_second': 245.386, 'eval_steps_per_second': 61.835, 'epoch': 17.0}


 90%|████████▉ | 6785/7540 [08:08<00:47, 15.74it/s]
 90%|█████████ | 6786/7540 [08:10<00:47, 15.74it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.6133421063423157, 'eval_f1': 0.7631699567887699, 'eval_precision': 0.7604318601637672, 'eval_recall': 0.7734284857579297, 'eval_runtime': 1.7116, 'eval_samples_per_second': 220.265, 'eval_steps_per_second': 55.505, 'epoch': 18.0}


 93%|█████████▎| 7003/7540 [08:25<00:32, 16.38it/s]

{'loss': 0.0009, 'grad_norm': 0.00629564980044961, 'learning_rate': 3.580901856763926e-06, 'epoch': 18.57}


 95%|█████████▌| 7163/7540 [08:35<00:23, 16.05it/s]
 95%|█████████▌| 7163/7540 [08:37<00:23, 16.05it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.6055001020431519, 'eval_f1': 0.769075036075036, 'eval_precision': 0.7666578469056345, 'eval_recall': 0.7767342708818967, 'eval_runtime': 1.6594, 'eval_samples_per_second': 227.192, 'eval_steps_per_second': 57.25, 'epoch': 19.0}


100%|█████████▉| 7539/7540 [09:02<00:00, 16.54it/s]
100%|██████████| 7540/7540 [09:06<00:00, 16.54it/s]


 ** Evaluating metrics for python test set. ** 

{'eval_loss': 0.6086220145225525, 'eval_f1': 0.7700501514609668, 'eval_precision': 0.7661173512235951, 'eval_recall': 0.7797656433302652, 'eval_runtime': 2.0143, 'eval_samples_per_second': 187.166, 'eval_steps_per_second': 47.164, 'epoch': 20.0}


100%|██████████| 7540/7540 [09:08<00:00, 13.74it/s]


{'train_runtime': 548.6492, 'train_samples_per_second': 54.935, 'train_steps_per_second': 13.743, 'train_loss': 0.07176546573797019, 'epoch': 20.0}


100%|██████████| 95/95 [00:01<00:00, 50.11it/s]



 ** Evaluating metrics for python test set. ** 

Evaluation Metrics: {'eval_loss': 0.6086220145225525, 'eval_f1': 0.7700501514609668, 'eval_precision': 0.7661173512235951, 'eval_recall': 0.7797656433302652, 'eval_runtime': 1.9312, 'eval_samples_per_second': 195.215, 'eval_steps_per_second': 49.192, 'epoch': 20.0}


0,1
eval/f1,▁▃▆▆▆▇▇▇▇▇▇▇██▇██████
eval/loss,▁▁▂▃▃▄▄▆▆▆▇▇▆▇▇▇█████
eval/precision,▁▄█▇▆██▇▇█▇▇██▇██████
eval/recall,▁▃▅▆▆▇▇▇▇▇▇▇▇████████
eval/runtime,▅▅▄▅▆▅▄▅▄▆▅▆█▅▅▄▁▃▂▆▅
eval/samples_per_second,▄▃▅▃▃▃▄▃▅▃▄▃▁▃▃▅█▆▆▂▃
eval/steps_per_second,▄▃▅▃▃▃▄▃▅▃▄▃▁▃▃▅█▆▆▂▃
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/grad_norm,█▁▄▁▁▁▁

0,1
eval/f1,0.77005
eval/loss,0.60862
eval/precision,0.76612
eval/recall,0.77977
eval/runtime,1.9312
eval/samples_per_second,195.215
eval/steps_per_second,49.192
total_flos,1982595203466240.0
train/epoch,20.0
train/global_step,7540.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1298/1298 [00:00<00:00, 9264.27 examples/s] 
  5%|▍         | 259/5200 [00:16<05:34, 14.77it/s]
  5%|▌         | 260/5200 [00:18<05:34, 14.77it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3284994065761566, 'eval_f1': 0.1700392630625189, 'eval_precision': 0.18278514154802813, 'eval_recall': 0.1602828854314003, 'eval_runtime': 1.5538, 'eval_samples_per_second': 167.329, 'eval_steps_per_second': 41.832, 'epoch': 1.0}


 10%|▉         | 519/5200 [00:36<04:46, 16.34it/s]
 10%|█         | 520/5200 [00:37<04:46, 16.34it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2742178440093994, 'eval_f1': 0.43730830998093256, 'eval_precision': 0.5775168514894542, 'eval_recall': 0.3536028687083806, 'eval_runtime': 1.0006, 'eval_samples_per_second': 259.846, 'eval_steps_per_second': 64.961, 'epoch': 2.0}


 15%|█▍        | 779/5200 [00:55<04:33, 16.15it/s]
 15%|█▌        | 780/5200 [00:56<04:33, 16.15it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.25247976183891296, 'eval_f1': 0.5053718701135115, 'eval_precision': 0.5888730612749364, 'eval_recall': 0.4519598991459811, 'eval_runtime': 1.0207, 'eval_samples_per_second': 254.728, 'eval_steps_per_second': 63.682, 'epoch': 3.0}


 19%|█▉        | 1003/5200 [01:12<04:27, 15.69it/s]

{'loss': 0.2426, 'grad_norm': 3.796809673309326, 'learning_rate': 4.038461538461539e-05, 'epoch': 3.85}


 20%|█▉        | 1039/5200 [01:14<04:43, 14.66it/s]
 20%|██        | 1040/5200 [01:16<04:43, 14.66it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.23415492475032806, 'eval_f1': 0.5374782265560537, 'eval_precision': 0.5846937269346174, 'eval_recall': 0.5003078704012732, 'eval_runtime': 1.247, 'eval_samples_per_second': 208.501, 'eval_steps_per_second': 52.125, 'epoch': 4.0}


 25%|██▍       | 1299/5200 [01:33<03:58, 16.35it/s]
 25%|██▌       | 1300/5200 [01:35<03:58, 16.35it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2660548985004425, 'eval_f1': 0.5982116823337522, 'eval_precision': 0.8579046770223242, 'eval_recall': 0.548129908267011, 'eval_runtime': 1.016, 'eval_samples_per_second': 255.903, 'eval_steps_per_second': 63.976, 'epoch': 5.0}


 30%|██▉       | 1559/5200 [01:53<03:52, 15.69it/s]
 30%|███       | 1560/5200 [01:54<03:51, 15.69it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2561839818954468, 'eval_f1': 0.619842491065433, 'eval_precision': 0.8142149951269524, 'eval_recall': 0.5685861883032236, 'eval_runtime': 1.0471, 'eval_samples_per_second': 248.305, 'eval_steps_per_second': 62.076, 'epoch': 6.0}


 35%|███▍      | 1819/5200 [02:12<03:30, 16.03it/s]
 35%|███▌      | 1820/5200 [02:13<03:30, 16.03it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.29928508400917053, 'eval_f1': 0.6476077144524239, 'eval_precision': 0.8134973420771543, 'eval_recall': 0.5917589672215388, 'eval_runtime': 1.099, 'eval_samples_per_second': 236.569, 'eval_steps_per_second': 59.142, 'epoch': 7.0}


 39%|███▊      | 2003/5200 [02:26<03:20, 15.93it/s]

{'loss': 0.079, 'grad_norm': 0.06235584244132042, 'learning_rate': 3.0769230769230774e-05, 'epoch': 7.69}


 40%|███▉      | 2079/5200 [02:31<03:11, 16.32it/s]
 40%|████      | 2080/5200 [02:32<03:11, 16.32it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.27772489190101624, 'eval_f1': 0.6323591542115048, 'eval_precision': 0.7346266495829522, 'eval_recall': 0.5900319013550004, 'eval_runtime': 1.0506, 'eval_samples_per_second': 247.481, 'eval_steps_per_second': 61.87, 'epoch': 8.0}


 45%|████▍     | 2339/5200 [02:50<02:52, 16.60it/s]
 45%|████▌     | 2340/5200 [02:51<02:52, 16.60it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.291658490896225, 'eval_f1': 0.6695624058709856, 'eval_precision': 0.8426202571000284, 'eval_recall': 0.6117155659620627, 'eval_runtime': 0.9517, 'eval_samples_per_second': 273.188, 'eval_steps_per_second': 68.297, 'epoch': 9.0}


 50%|████▉     | 2599/5200 [03:10<02:45, 15.71it/s]
 50%|█████     | 2600/5200 [03:11<02:45, 15.71it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.27662259340286255, 'eval_f1': 0.6643192737950029, 'eval_precision': 0.7101606232200509, 'eval_recall': 0.6512573286718256, 'eval_runtime': 1.0484, 'eval_samples_per_second': 247.997, 'eval_steps_per_second': 61.999, 'epoch': 10.0}


 55%|█████▍    | 2859/5200 [03:29<02:29, 15.62it/s]
 55%|█████▌    | 2860/5200 [03:30<02:29, 15.62it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2779850363731384, 'eval_f1': 0.6886383602150836, 'eval_precision': 0.7844023487551952, 'eval_recall': 0.6617289733406918, 'eval_runtime': 1.331, 'eval_samples_per_second': 195.344, 'eval_steps_per_second': 48.836, 'epoch': 11.0}


 58%|█████▊    | 3003/5200 [03:41<02:19, 15.80it/s]

{'loss': 0.0297, 'grad_norm': 0.023033620789647102, 'learning_rate': 2.1153846153846154e-05, 'epoch': 11.54}


 60%|█████▉    | 3119/5200 [03:48<02:12, 15.76it/s]
 60%|██████    | 3120/5200 [03:49<02:12, 15.76it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2876938581466675, 'eval_f1': 0.6904528742788714, 'eval_precision': 0.7369242914073304, 'eval_recall': 0.6721341158729077, 'eval_runtime': 0.9933, 'eval_samples_per_second': 261.742, 'eval_steps_per_second': 65.435, 'epoch': 12.0}


 65%|██████▍   | 3379/5200 [04:07<01:50, 16.47it/s]
 65%|██████▌   | 3380/5200 [04:08<01:50, 16.47it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.2895984649658203, 'eval_f1': 0.6927580949766926, 'eval_precision': 0.7217236360826336, 'eval_recall': 0.6820227929962135, 'eval_runtime': 1.0534, 'eval_samples_per_second': 246.817, 'eval_steps_per_second': 61.704, 'epoch': 13.0}


 70%|██████▉   | 3639/5200 [04:26<01:42, 15.28it/s]
 70%|███████   | 3640/5200 [04:28<01:42, 15.28it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.30124932527542114, 'eval_f1': 0.6963134612819487, 'eval_precision': 0.7395186078685285, 'eval_recall': 0.6963008147833546, 'eval_runtime': 1.3039, 'eval_samples_per_second': 199.4, 'eval_steps_per_second': 49.85, 'epoch': 14.0}


 75%|███████▍  | 3899/5200 [04:46<01:16, 17.11it/s]
 75%|███████▌  | 3900/5200 [04:47<01:15, 17.11it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.30805206298828125, 'eval_f1': 0.6977983875960635, 'eval_precision': 0.8145287461076934, 'eval_recall': 0.6533615004437575, 'eval_runtime': 1.0124, 'eval_samples_per_second': 256.817, 'eval_steps_per_second': 64.204, 'epoch': 15.0}


 77%|███████▋  | 4001/5200 [04:55<01:22, 14.58it/s]

{'loss': 0.0129, 'grad_norm': 0.03353224694728851, 'learning_rate': 1.153846153846154e-05, 'epoch': 15.38}


 80%|███████▉  | 4159/5200 [05:05<01:06, 15.73it/s]
 80%|████████  | 4160/5200 [05:06<01:06, 15.73it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3213150203227997, 'eval_f1': 0.6961023052728121, 'eval_precision': 0.8041675261534814, 'eval_recall': 0.6675839852422293, 'eval_runtime': 0.9484, 'eval_samples_per_second': 274.147, 'eval_steps_per_second': 68.537, 'epoch': 16.0}


 85%|████████▍ | 4419/5200 [05:24<00:50, 15.40it/s]
 85%|████████▌ | 4420/5200 [05:26<00:50, 15.40it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3066445291042328, 'eval_f1': 0.6958731780030035, 'eval_precision': 0.7655540916410482, 'eval_recall': 0.6658785072464786, 'eval_runtime': 1.1871, 'eval_samples_per_second': 219.019, 'eval_steps_per_second': 54.755, 'epoch': 17.0}


 90%|████████▉ | 4679/5200 [05:43<00:31, 16.71it/s]
 90%|█████████ | 4680/5200 [05:45<00:31, 16.71it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.31284299492836, 'eval_f1': 0.6852872808781514, 'eval_precision': 0.7560805842401274, 'eval_recall': 0.6569347704932982, 'eval_runtime': 1.0947, 'eval_samples_per_second': 237.506, 'eval_steps_per_second': 59.376, 'epoch': 18.0}


 95%|█████████▍| 4939/5200 [06:03<00:16, 15.78it/s]
 95%|█████████▌| 4940/5200 [06:04<00:16, 15.78it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.31227636337280273, 'eval_f1': 0.6840104872367527, 'eval_precision': 0.7539069941728908, 'eval_recall': 0.6569347704932982, 'eval_runtime': 0.9397, 'eval_samples_per_second': 276.68, 'eval_steps_per_second': 69.17, 'epoch': 19.0}


 96%|█████████▌| 5003/5200 [06:09<00:12, 16.07it/s]

{'loss': 0.0048, 'grad_norm': 0.018788572400808334, 'learning_rate': 1.9230769230769234e-06, 'epoch': 19.23}


100%|█████████▉| 5199/5200 [06:22<00:00, 16.31it/s]
100%|██████████| 5200/5200 [06:25<00:00, 16.31it/s]


 ** Evaluating metrics for pharo test set. ** 

{'eval_loss': 0.3107616901397705, 'eval_f1': 0.6851150889818214, 'eval_precision': 0.7594355625951057, 'eval_recall': 0.6535334099490805, 'eval_runtime': 1.0347, 'eval_samples_per_second': 251.283, 'eval_steps_per_second': 62.821, 'epoch': 20.0}


100%|██████████| 5200/5200 [06:27<00:00, 13.42it/s]


{'train_runtime': 387.5378, 'train_samples_per_second': 53.569, 'train_steps_per_second': 13.418, 'train_loss': 0.07108631414862779, 'epoch': 20.0}


100%|██████████| 65/65 [00:01<00:00, 60.18it/s]



 ** Evaluating metrics for pharo test set. ** 

Evaluation Metrics: {'eval_loss': 0.30805206298828125, 'eval_f1': 0.6977983875960635, 'eval_precision': 0.8145287461076934, 'eval_recall': 0.6533615004437575, 'eval_runtime': 1.1183, 'eval_samples_per_second': 232.505, 'eval_steps_per_second': 58.126, 'epoch': 20.0}
    comparison                                     model     lan  \
0   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
1   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
2   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
3   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
4   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
5   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
6   validation  epoch-20_batchSize-4_weightsOfDecay-0.01    java   
7   validation  epoch-20_batchSize-4_weightsOfDecay-0.01  python   
8   validation  epoch-20_batchSize-4_weightsOfDecay-0.01  python   
9   

0,1
eval/f1,▁▅▅▆▇▇▇▇█████████████
eval/loss,█▄▂▁▃▃▆▄▅▄▄▅▅▆▆▇▆▇▇▇▆
eval/precision,▁▅▅▅███▇█▆▇▇▇▇█▇▇▇▇▇█
eval/recall,▁▄▅▅▆▆▇▇▇▇████▇██▇▇▇▇
eval/runtime,█▂▂▅▂▂▃▂▁▂▅▂▂▅▂▁▄▃▁▂▃
eval/samples_per_second,▁▇▇▄▇▆▅▆█▆▃▇▆▃▇█▄▅█▆▅
eval/steps_per_second,▁▇▇▄▇▆▅▆█▆▃▇▆▃▇█▄▅█▆▅
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇█████
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇█████
train/grad_norm,█▁▁▁▁

0,1
eval/f1,0.6978
eval/loss,0.30805
eval/precision,0.81453
eval/recall,0.65336
eval/runtime,1.1183
eval/samples_per_second,232.505
eval/steps_per_second,58.126
total_flos,1365607680768000.0
train/epoch,20.0
train/global_step,5200.0


  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 7614/7614 [00:00<00:00, 9994.93 examples/s] 
  5%|▌         | 762/15240 [00:56<17:46, 13.58it/s]
  5%|▌         | 762/15240 [01:00<17:46, 13.58it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13600832223892212, 'eval_f1': 0.595752940682327, 'eval_precision': 0.6642296620530903, 'eval_recall': 0.5794466301167837, 'eval_runtime': 3.8513, 'eval_samples_per_second': 395.453, 'eval_steps_per_second': 49.594, 'epoch': 1.0}


  7%|▋         | 1002/15240 [01:19<18:34, 12.78it/s] 

{'loss': 0.137, 'grad_norm': 1.6239041090011597, 'learning_rate': 4.671916010498688e-05, 'epoch': 1.31}


 10%|█         | 1524/15240 [01:58<16:30, 13.85it/s]
 10%|█         | 1524/15240 [02:02<16:30, 13.85it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08273111283779144, 'eval_f1': 0.7840341369452382, 'eval_precision': 0.8953788351224469, 'eval_recall': 0.7348765573944892, 'eval_runtime': 3.8679, 'eval_samples_per_second': 393.751, 'eval_steps_per_second': 49.38, 'epoch': 2.0}


 13%|█▎        | 2002/15240 [02:42<17:10, 12.85it/s]  

{'loss': 0.0764, 'grad_norm': 0.2853468060493469, 'learning_rate': 4.343832020997376e-05, 'epoch': 2.62}


 15%|█▌        | 2286/15240 [03:04<17:45, 12.16it/s]
 15%|█▌        | 2286/15240 [03:08<17:45, 12.16it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.07885777205228806, 'eval_f1': 0.8376249942828273, 'eval_precision': 0.8762725775821416, 'eval_recall': 0.8063444282200892, 'eval_runtime': 4.2352, 'eval_samples_per_second': 359.602, 'eval_steps_per_second': 45.098, 'epoch': 3.0}


 20%|█▉        | 3002/15240 [04:05<16:04, 12.69it/s]  

{'loss': 0.0528, 'grad_norm': 0.2716434597969055, 'learning_rate': 4.015748031496063e-05, 'epoch': 3.94}


 20%|██        | 3048/15240 [04:08<15:04, 13.48it/s]
 20%|██        | 3048/15240 [04:12<15:04, 13.48it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.09326793998479843, 'eval_f1': 0.8464513725066852, 'eval_precision': 0.889077349059263, 'eval_recall': 0.8152661186793833, 'eval_runtime': 3.9817, 'eval_samples_per_second': 382.5, 'eval_steps_per_second': 47.97, 'epoch': 4.0}


 25%|██▌       | 3810/15240 [05:13<13:44, 13.86it/s]  
 25%|██▌       | 3810/15240 [05:17<13:44, 13.86it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.08350060880184174, 'eval_f1': 0.8826813132708241, 'eval_precision': 0.9204711412246528, 'eval_recall': 0.8545443380587506, 'eval_runtime': 4.1366, 'eval_samples_per_second': 368.178, 'eval_steps_per_second': 46.173, 'epoch': 5.0}


 26%|██▋       | 4002/15240 [05:33<15:17, 12.25it/s]  

{'loss': 0.0311, 'grad_norm': 2.059251070022583, 'learning_rate': 3.6876640419947505e-05, 'epoch': 5.25}


 30%|███       | 4572/15240 [06:17<13:29, 13.18it/s]
 30%|███       | 4572/15240 [06:21<13:29, 13.18it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10245852172374725, 'eval_f1': 0.8622236080338224, 'eval_precision': 0.8675926597954341, 'eval_recall': 0.8589727912247767, 'eval_runtime': 4.0177, 'eval_samples_per_second': 379.071, 'eval_steps_per_second': 47.539, 'epoch': 6.0}


 33%|███▎      | 5002/15240 [06:55<12:59, 13.13it/s]  

{'loss': 0.0215, 'grad_norm': 5.91235876083374, 'learning_rate': 3.3595800524934386e-05, 'epoch': 6.56}


 35%|███▌      | 5334/15240 [07:21<11:58, 13.79it/s]
 35%|███▌      | 5334/15240 [07:25<11:58, 13.79it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.10011924058198929, 'eval_f1': 0.8833959615145589, 'eval_precision': 0.9026522386262122, 'eval_recall': 0.8664516478315684, 'eval_runtime': 3.9879, 'eval_samples_per_second': 381.907, 'eval_steps_per_second': 47.895, 'epoch': 7.0}


 39%|███▉      | 6002/15240 [08:18<12:29, 12.32it/s]  

{'loss': 0.0177, 'grad_norm': 0.015838464722037315, 'learning_rate': 3.0314960629921263e-05, 'epoch': 7.87}


 40%|████      | 6096/15240 [08:25<10:55, 13.96it/s]
 40%|████      | 6096/15240 [08:29<10:55, 13.96it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.11453189700841904, 'eval_f1': 0.8694821776377111, 'eval_precision': 0.8715008088055678, 'eval_recall': 0.869969917172963, 'eval_runtime': 3.9152, 'eval_samples_per_second': 388.993, 'eval_steps_per_second': 48.784, 'epoch': 8.0}


 45%|████▌     | 6858/15240 [09:29<10:02, 13.92it/s]  
 45%|████▌     | 6858/15240 [09:33<10:02, 13.92it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1237432062625885, 'eval_f1': 0.8693352639543674, 'eval_precision': 0.8712078316149654, 'eval_recall': 0.8690166760143494, 'eval_runtime': 3.9457, 'eval_samples_per_second': 385.987, 'eval_steps_per_second': 48.407, 'epoch': 9.0}


 46%|████▌     | 7002/15240 [09:45<10:20, 13.27it/s]  

{'loss': 0.01, 'grad_norm': 4.998196125030518, 'learning_rate': 2.7034120734908137e-05, 'epoch': 9.19}


 50%|█████     | 7620/15240 [10:32<09:12, 13.79it/s]
 50%|█████     | 7620/15240 [10:36<09:12, 13.79it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1264461874961853, 'eval_f1': 0.8742071654032213, 'eval_precision': 0.8836204085022674, 'eval_recall': 0.8671039573256752, 'eval_runtime': 3.9877, 'eval_samples_per_second': 381.926, 'eval_steps_per_second': 47.898, 'epoch': 10.0}


 53%|█████▎    | 8002/15240 [11:07<09:42, 12.43it/s]  

{'loss': 0.0079, 'grad_norm': 0.2546742856502533, 'learning_rate': 2.3753280839895015e-05, 'epoch': 10.5}


 55%|█████▌    | 8382/15240 [11:36<08:38, 13.22it/s]
 55%|█████▌    | 8382/15240 [11:41<08:38, 13.22it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12206801027059555, 'eval_f1': 0.8603397556627994, 'eval_precision': 0.8712284034346544, 'eval_recall': 0.851916513386044, 'eval_runtime': 4.1807, 'eval_samples_per_second': 364.293, 'eval_steps_per_second': 45.686, 'epoch': 11.0}


 59%|█████▉    | 9002/15240 [12:30<08:02, 12.94it/s]  

{'loss': 0.0068, 'grad_norm': 0.05760088562965393, 'learning_rate': 2.0472440944881892e-05, 'epoch': 11.81}


 60%|██████    | 9144/15240 [12:41<07:18, 13.92it/s]
 60%|██████    | 9144/15240 [12:45<07:18, 13.92it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12837813794612885, 'eval_f1': 0.8694309512235999, 'eval_precision': 0.8864430369369559, 'eval_recall': 0.8553091120519923, 'eval_runtime': 4.0071, 'eval_samples_per_second': 380.073, 'eval_steps_per_second': 47.665, 'epoch': 12.0}


 65%|██████▌   | 9906/15240 [13:45<06:22, 13.94it/s]  
 65%|██████▌   | 9906/15240 [13:49<06:22, 13.94it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1253124326467514, 'eval_f1': 0.8749734822056999, 'eval_precision': 0.897222875803936, 'eval_recall': 0.8577733490988496, 'eval_runtime': 3.9722, 'eval_samples_per_second': 383.416, 'eval_steps_per_second': 48.084, 'epoch': 13.0}


 66%|██████▌   | 10002/15240 [13:58<06:36, 13.23it/s] 

{'loss': 0.0041, 'grad_norm': 0.011028129607439041, 'learning_rate': 1.7191601049868766e-05, 'epoch': 13.12}


 70%|███████   | 10668/15240 [14:48<05:41, 13.38it/s]
 70%|███████   | 10668/15240 [14:52<05:41, 13.38it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12527745962142944, 'eval_f1': 0.876565876487294, 'eval_precision': 0.91037315704353, 'eval_recall': 0.8517746087309381, 'eval_runtime': 3.8918, 'eval_samples_per_second': 391.332, 'eval_steps_per_second': 49.077, 'epoch': 14.0}


 72%|███████▏  | 11002/15240 [15:19<05:19, 13.25it/s]  

{'loss': 0.0031, 'grad_norm': 0.0015589578542858362, 'learning_rate': 1.3910761154855645e-05, 'epoch': 14.44}


 75%|███████▌  | 11430/15240 [15:51<04:54, 12.92it/s]
 75%|███████▌  | 11430/15240 [15:55<04:54, 12.92it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.12203247845172882, 'eval_f1': 0.8754698643532567, 'eval_precision': 0.8866283451257138, 'eval_recall': 0.8657301467600874, 'eval_runtime': 4.0764, 'eval_samples_per_second': 373.618, 'eval_steps_per_second': 46.856, 'epoch': 15.0}


 79%|███████▊  | 12001/15240 [16:40<04:20, 12.42it/s]  

{'loss': 0.003, 'grad_norm': 0.008200361393392086, 'learning_rate': 1.062992125984252e-05, 'epoch': 15.75}


 80%|███████▉  | 12191/15240 [16:55<03:51, 13.18it/s]
 80%|████████  | 12192/15240 [16:59<03:51, 13.18it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1282494068145752, 'eval_f1': 0.8863374424065347, 'eval_precision': 0.8947534470804962, 'eval_recall': 0.8799501744697215, 'eval_runtime': 3.912, 'eval_samples_per_second': 389.312, 'eval_steps_per_second': 48.824, 'epoch': 16.0}


 85%|████████▌ | 12954/15240 [17:58<02:59, 12.73it/s]
 85%|████████▌ | 12954/15240 [18:03<02:59, 12.73it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1278945654630661, 'eval_f1': 0.8847662641649439, 'eval_precision': 0.9110718386010735, 'eval_recall': 0.8670562618296404, 'eval_runtime': 4.105, 'eval_samples_per_second': 371.007, 'eval_steps_per_second': 46.528, 'epoch': 17.0}


 85%|████████▌ | 13002/15240 [18:08<02:50, 13.10it/s]

{'loss': 0.0025, 'grad_norm': 0.003275152761489153, 'learning_rate': 7.349081364829396e-06, 'epoch': 17.06}


 90%|█████████ | 13716/15240 [19:02<01:53, 13.48it/s]
 90%|█████████ | 13716/15240 [19:05<01:53, 13.48it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.13323579728603363, 'eval_f1': 0.8763330135297088, 'eval_precision': 0.8804588713301351, 'eval_recall': 0.8739074791687143, 'eval_runtime': 3.9166, 'eval_samples_per_second': 388.853, 'eval_steps_per_second': 48.766, 'epoch': 18.0}


 92%|█████████▏| 14002/15240 [19:28<01:33, 13.19it/s]

{'loss': 0.0014, 'grad_norm': 0.0008577531552873552, 'learning_rate': 4.068241469816273e-06, 'epoch': 18.37}


 95%|█████████▌| 14478/15240 [20:04<00:56, 13.42it/s]
 95%|█████████▌| 14478/15240 [20:08<00:56, 13.42it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1325462907552719, 'eval_f1': 0.8820201445988144, 'eval_precision': 0.8900449954373248, 'eval_recall': 0.875929923511794, 'eval_runtime': 4.1481, 'eval_samples_per_second': 367.155, 'eval_steps_per_second': 46.045, 'epoch': 19.0}


 98%|█████████▊| 15002/15240 [20:49<00:17, 13.29it/s]

{'loss': 0.0013, 'grad_norm': 0.004687155596911907, 'learning_rate': 7.874015748031496e-07, 'epoch': 19.69}


100%|██████████| 15240/15240 [21:07<00:00, 13.97it/s]
100%|██████████| 15240/15240 [21:12<00:00, 13.97it/s]


 ** Evaluating metrics for java test set. ** 

{'eval_loss': 0.1330864578485489, 'eval_f1': 0.8802168728705781, 'eval_precision': 0.8914786291378555, 'eval_recall': 0.8708136213164771, 'eval_runtime': 3.9217, 'eval_samples_per_second': 388.355, 'eval_steps_per_second': 48.704, 'epoch': 20.0}


100%|██████████| 15240/15240 [21:14<00:00, 11.96it/s]


{'train_runtime': 1274.6978, 'train_samples_per_second': 95.568, 'train_steps_per_second': 11.956, 'train_loss': 0.024718011516755022, 'epoch': 20.0}


100%|██████████| 191/191 [00:03<00:00, 50.23it/s]



 ** Evaluating metrics for java test set. ** 

Evaluation Metrics: {'eval_loss': 0.1282494068145752, 'eval_f1': 0.8863374424065347, 'eval_precision': 0.8947534470804962, 'eval_recall': 0.8799501744697215, 'eval_runtime': 3.8528, 'eval_samples_per_second': 395.294, 'eval_steps_per_second': 49.574, 'epoch': 20.0}


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })

Evaluating the predictions.

In [15]:
scores

Unnamed: 0,comparison,model,lan,cat,precision,recall,f1
0,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,python,Usage,0.897959,0.727273,0.803653
1,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,python,Parameters,0.852174,0.875,0.863436
2,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,python,DevelopmentNotes,0.681818,0.75,0.714286
3,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,python,Expand,0.657895,0.724638,0.689655
4,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,python,Summary,0.740741,0.821918,0.779221
5,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,pharo,Keyimplementationpoints,0.815789,0.738095,0.775
6,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,pharo,Example,0.926316,0.871287,0.897959
7,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,pharo,Responsibilities,0.711111,0.64,0.673684
8,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,pharo,Classreferences,1.0,0.25,0.4
9,validation,epoch-20_batchSize-4_weightsOfDecay-0.01,pharo,Intent,0.939394,0.815789,0.873239


In [16]:
best_f1_scores = []
best_f1_java_row = scores[scores['lan'] == 'java'].loc[scores[scores['lan'] == 'java']['f1'].idxmax()]
best_f1_scores.append(best_f1_java_row)

best_f1_python_row = scores[scores['lan'] == 'python'].loc[scores[scores['lan'] == 'python']['f1'].idxmax()]
best_f1_scores.append(best_f1_python_row)

best_f1_pharo_row = scores[scores['lan'] == 'pharo'].loc[scores[scores['lan'] == 'pharo']['f1'].idxmax()]
best_f1_scores.append(best_f1_pharo_row)
best_f1_scores

[comparison                                  validation
 model         epoch-20_batchSize-8_weightsOfDecay-0.01
 lan                                               java
 cat                                          Ownership
 precision                                     0.947368
 recall                                        0.981818
 f1                                            0.964286
 Name: 13, dtype: object,
 comparison                                  validation
 model         epoch-20_batchSize-4_weightsOfDecay-0.01
 lan                                             python
 cat                                         Parameters
 precision                                     0.852174
 recall                                           0.875
 f1                                            0.863436
 Name: 1, dtype: object,
 comparison                                  validation
 model         epoch-20_batchSize-4_weightsOfDecay-0.01
 lan                                              pha

In [21]:
gs_metrics_dict

{'java_epoch-5_batchSize-4_weightsOfDecay-0.01': {'lang': 'java',
  'model': 'epoch-5_batchSize-4_weightsOfDecay-0.01',
  'avg_runtime': 77.52191851139068,
  'avg_flops': 34044.01424896312,
  'total_flops': 340440.1424896312,
  'total_time': 775.2191851139069,
  'avg_f1': nan,
  'succ_score': nan},
 'python_epoch-5_batchSize-4_weightsOfDecay-0.01': {'lang': 'python',
  'model': 'epoch-5_batchSize-4_weightsOfDecay-0.01',
  'avg_runtime': 19.481082320213318,
  'avg_flops': 8427.177680895456,
  'total_flops': 84271.77680895456,
  'total_time': 194.81082320213318,
  'avg_f1': nan,
  'succ_score': nan},
 'pharo_epoch-5_batchSize-4_weightsOfDecay-0.01': {'lang': 'pharo',
  'model': 'epoch-5_batchSize-4_weightsOfDecay-0.01',
  'avg_runtime': 13.52945659160614,
  'avg_flops': 5811.847475200231,
  'total_flops': 58118.47475200231,
  'total_time': 135.2945659160614,
  'avg_f1': nan,
  'succ_score': nan},
 'java_epoch-5_batchSize-8_weightsOfDecay-0.01': {'lang': 'java',
  'model': 'epoch-5_batchS

In [22]:
final_model_stats

{'epoch-5_batchSize-4_weightsOfDecay-0.01': -5.51,
 'epoch-5_batchSize-8_weightsOfDecay-0.01': -5.67,
 'epoch-10_batchSize-4_weightsOfDecay-0.01': -5.8,
 'epoch-10_batchSize-8_weightsOfDecay-0.01': -5.95,
 'epoch-15_batchSize-4_weightsOfDecay-0.01': -6.02,
 'epoch-15_batchSize-8_weightsOfDecay-0.01': -6.05,
 'epoch-20_batchSize-4_weightsOfDecay-0.01': -6.07}

In [20]:
best_fin_succ_score = max(final_model_stats.values())
best_fin_succ_score

-5.51