In [1]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers

import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [55]:
from functools import partial

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, load_dataset, load_metric

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

In [3]:
TRAIN_DATA_FILE = 'irse.train.csv'
DEV_DATA_FILE = 'irse.dev.csv'
TEST_DATA_FILE = 'irse.test.csv'
GIVEN_TEST_FILE = 'irse.given-test.csv'
MODEL_KEY = 'microsoft/codebert-base'
EXP_NAME = 'irse-codebert'

TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 16
ACCUMULATE_GRAD_STEPS = 2
N_EPOCHS = 5
LEARNING_RATE = 6e-5
SCHEDULER_TYPE = 'cosine'
LR_WARMUP_RATIO = 0.4
LOG_STEPS = 50
SEED = 43419

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_KEY)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_KEY)

tokenizer, model

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

(PreTrainedTokenizerFast(name_or_path='microsoft/codebert-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}),
 RobertaForSequenceClassification(
   (roberta): RobertaModel(
     (embeddings): RobertaEmbeddings(
       (word_embeddings): Embedding(50265, 768, padding_idx=1)
       (position_embeddings): Embedding(514, 768, padding_idx=1)
       (token_type_embeddings): Embedding(1, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): RobertaEncoder(
       (layer): ModuleList(
         (0): RobertaLayer(
           (attention): RobertaAttention(
             (self): RobertaSelfAttention(
            

In [5]:
data_files = {
    'train': TRAIN_DATA_FILE,
    'dev': DEV_DATA_FILE,
    'test': TEST_DATA_FILE,
    'giventest': GIVEN_TEST_FILE
}
ds = load_dataset('csv', data_files=data_files)

ds

Using custom data configuration default-45d70fe36903500c
Reusing dataset csv (/home2/sagarsj42/.cache/huggingface/datasets/csv/default-45d70fe36903500c/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 1001
    })
})

In [32]:
def tokenizer_func(batch, tokenizer):
    text = [batch['Comments'][i] + tokenizer.sep_token + batch['Surrounding Code Context'][i] 
        for i in range(len(batch['Comments']))]
    tok = tokenizer(text, max_length=tokenizer.model_max_length, truncation=True, 
        padding=True, return_attention_mask=True)

    return tok

In [50]:
tokenizer_partial = partial(tokenizer_func, tokenizer=tokenizer)
ds_tok = dict()

ds_tok['train'] = ds['train'].map(tokenizer_partial, batched=True, batch_size=TRAIN_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['dev'] = ds['dev'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['test'] = ds['test'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['giventest'] = ds['giventest'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok = DatasetDict(ds_tok)

ds_tok

  0%|          | 0/1339 [00:00<?, ?ba/s]

  0%|          | 0/38 [00:00<?, ?ba/s]

  0%|          | 0/43 [00:00<?, ?ba/s]

  0%|          | 0/63 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'attention_mask'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'attention_mask'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1001
    })
})

In [51]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scores = logits[:, -1]
    
    acc_metric = load_metric('accuracy')
    f1_metric = load_metric('f1')
    mcc_metric = load_metric('matthews_correlation')
    roc_metric = load_metric('roc_auc')

    metrics_dict = {
        'accuracy': acc_metric.compute(predictions=predictions, references=labels)['accuracy'],
        'f1_score': f1_metric.compute(predictions=predictions, references=labels)['f1'],
        'matthews_cc': mcc_metric.compute(predictions=predictions, references=labels)['matthews_correlation'],
        'roc_auc_score': roc_metric.compute(prediction_scores=scores, references=labels)['roc_auc'],
    }

    return metrics_dict

In [52]:
training_args = TrainingArguments(
    max_steps=100,
    output_dir=EXP_NAME,
    run_name=EXP_NAME,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=ACCUMULATE_GRAD_STEPS,
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=SCHEDULER_TYPE,
    warmup_ratio=LR_WARMUP_RATIO,
    evaluation_strategy='epoch',
    logging_steps=LOG_STEPS,
    save_strategy='epoch',
    metric_for_best_model='f1_score',
    greater_is_better=True,
    load_best_model_at_end=True,
    no_cuda=False,
    seed=SEED,
    fp16=False,
    dataloader_drop_last=False
)

training_args

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=6e-05,
length_column_name=length,
load_best_model_at_end=True,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logg

In [56]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='microsoft/codebert-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_tok['train'],
    eval_dataset=ds_tok['dev'],
    compute_metrics=compute_metrics
)

trainer

max_steps is given, it will override any value given in num_train_epochs


<transformers.trainer.Trainer at 0x7fe3329f90d0>

In [65]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Surrounding Code Context, Comments, Class. If Surrounding Code Context, Comments, Class are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5354
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 100


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score,Matthews Cc,Roc Auc Score
0,0.0521,0.014107,0.998319,0.998523,0.996579,1.0


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Surrounding Code Context, Comments, Class. If Surrounding Code Context, Comments, Class are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 595
  Batch size = 16
Saving model checkpoint to irse-codebert/checkpoint-100
Configuration saved in irse-codebert/checkpoint-100/config.json
Model weights saved in irse-codebert/checkpoint-100/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from irse-codebert/checkpoint-100 (score: 0.9985228951255538).


TrainOutput(global_step=100, training_loss=0.05671430110931396, metrics={'train_runtime': 51.1048, 'train_samples_per_second': 15.654, 'train_steps_per_second': 1.957, 'total_flos': 200001402065760.0, 'train_loss': 0.05671430110931396, 'epoch': 0.15})

In [66]:
preds, labels, metrics = trainer.predict(ds_tok['test'])

preds.shape, labels.shape, metrics

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Surrounding Code Context, Comments, Class. If Surrounding Code Context, Comments, Class are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 678
  Batch size = 16


((678, 2),
 (678,),
 {'test_loss': 0.03539285436272621,
  'test_accuracy': 0.995575221238938,
  'test_f1_score': 0.9944954128440368,
  'test_matthews_cc': 0.9908009595277535,
  'test_roc_auc_score': 0.9997648442092887,
  'test_runtime': 15.2906,
  'test_samples_per_second': 44.341,
  'test_steps_per_second': 2.812})

In [67]:
preds, labels, metrics = trainer.predict(ds_tok['giventest'])

preds.shape, labels.shape, metrics

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Surrounding Code Context, Comments, Class. If Surrounding Code Context, Comments, Class are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1001
  Batch size = 16


((1001, 2),
 (1001,),
 {'test_loss': 0.024106988683342934,
  'test_accuracy': 0.997002997002997,
  'test_f1_score': 0.9946714031971581,
  'test_matthews_cc': 0.9925896618531443,
  'test_roc_auc_score': 0.9998471083755018,
  'test_runtime': 18.3678,
  'test_samples_per_second': 54.498,
  'test_steps_per_second': 3.43})