In [1]:
%env WANDB_PROJECT=aspect_explain_root_cause_770m

env: WANDB_PROJECT=aspect_explain_root_cause_770m


In [2]:
from project_dataset import load_dataset


In [3]:
from dataclasses import dataclass

@dataclass
class Args:
    model_name = "Salesforce/codet5p-770m"
    num_proc = 4
    batch_size = 2
    max_src_length = 1200
    max_des_length = 153
    data_cols = ["CVE ID", "explain", "func_before", "processed_func"]
    save_dir = 'tf_board'
    epochs = 5
    grad_acc_steps = 4
    lr = 5e-5
    log_freq = 10
    local_rank = -1
    deepspeed = None
    fp16 = True
    lr_warmup_steps = 200
    weight_decay = 0.05
    task = "root_cause"
    prefix = "770m"
    
args = Args()

In [4]:
ds = load_dataset(args.task)

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 3431
    })
    validation: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 382
    })
    test: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 954
    })
})

In [6]:
import os

os.makedirs(args.save_dir, exist_ok=True)

In [7]:
from transformers import AutoTokenizer
codet5p_tokenizer = AutoTokenizer.from_pretrained(args.model_name)

In [8]:
import numpy as np

def preprocess_function(examples):
    source = [' '.join(ex) for ex in examples["func_before"]]
    target = [' '.join(ex) for ex in examples["explain"]]

    input_feature = codet5p_tokenizer(source, max_length=args.max_src_length, padding="max_length", truncation=True)
    labels = codet5p_tokenizer(target, max_length=args.max_des_length, padding="max_length", truncation=True)

    lables = labels["input_ids"].copy()
    # lables = np.where(lables != codet5p_tokenizer.pad_token_id, lables, -100)

    return {  "input_ids": input_feature["input_ids"],
              "attention_mask": input_feature["attention_mask"],
              "labels": lables}


tokenized_ds = ds.map(
  preprocess_function,
  remove_columns=args.data_cols,
  batched=True,
  num_proc=args.num_proc,
  batch_size=args.batch_size)

tokenized_ds

Map (num_proc=4):   0%|          | 0/3431 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/382 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/954 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3431
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 382
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 954
    })
})

In [9]:
import torch
from transformers import AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name).to(device)

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  codet5p_tokenizer,
  model=model,
  return_tensors="pt")

In [11]:
import evaluate
import numpy as np

rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  encoded_arg = codet5p_tokenizer(arg)
  return codet5p_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  preds, labels = eval_arg
  # Replace -100
  labels = np.where(labels != -100, labels, codet5p_tokenizer.pad_token_id)
  # Convert id tokens to text
  text_preds = codet5p_tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = codet5p_tokenizer.batch_decode(labels, skip_special_tokens=True)
  # Insert a line break (\n) in each sentence for ROUGE scoring
  text_preds = ["\n".join(p for p in text_preds)]
  text_labels = ["\n".join(l for l in text_labels)]
  # compute ROUGE score with custom tokenization
  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )

In [12]:
# for testing, show first 5 rows

from torch.utils.data import DataLoader

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)

for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      max_length=args.max_des_length,
    ).cpu()
  labels = batch["labels"].cpu()
  break

metrics_func([preds, labels])

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Token indices sequence length is longer than the specified maximum sequence length for this model (764 > 512). Running this sequence through the model will result in indexing errors


{'rouge1': 0.4471057884231536,
 'rouge2': 0.132,
 'rougeL': 0.20958083832335328,
 'rougeLsum': 0.46028037383177567}

In [13]:
preds.shape, labels.shape

(torch.Size([5, 153]), torch.Size([5, 153]))

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
        report_to='wandb',
        output_dir=args.save_dir,
        overwrite_output_dir=False,

        do_train=True,
        save_strategy='epoch',
        do_eval=True,

        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_acc_steps,

        learning_rate=args.lr,
        weight_decay=args.weight_decay,
        warmup_steps=args.lr_warmup_steps,

        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_total_limit=1,

        dataloader_drop_last=True,
        dataloader_num_workers=args.num_proc,

        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

In [None]:
from transformers import Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = metrics_func,
    train_dataset = tokenized_ds["train"],
    eval_dataset = tokenized_ds["validation"],
    tokenizer = codet5p_tokenizer,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdongchirua[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
1,9.3343
10,8.9542
20,7.8111
30,6.985
40,5.9761
50,5.1689
60,4.7436
70,4.459
80,4.205
90,4.1073




In [16]:
trainer.save_model(f'{args.save_dir}/{args.prefix}/{args.task}')

## measure on test set

In [24]:
from pathlib import Path
model.load_state_dict(torch.load(Path(f'{args.save_dir}/{args.prefix}/{args.task}')/"pytorch_model.bin"))

<All keys matched successfully>

In [25]:
sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=50,
  num_workers=args.num_proc)

rouge_list = []

for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      max_length=args.max_des_length,
    ).cpu()
  labels = batch["labels"].cpu()

  rouge_result = metrics_func([preds, labels])
  print(rouge_result)
  rouge_list.append(rouge_result)

{'rouge1': 0.9092395748160261, 'rouge2': 0.770961145194274, 'rougeL': 0.5343417825020441, 'rougeLsum': 0.9070140280561122}
{'rouge1': 0.8951804625027014, 'rouge2': 0.7744864864864864, 'rougeL': 0.5632159066349687, 'rougeLsum': 0.8960846560846562}
{'rouge1': 0.9270286047869236, 'rouge2': 0.7899552267860619, 'rougeL': 0.5806577155088539, 'rougeLsum': 0.927248424670613}
{'rouge1': 0.9463270890338875, 'rouge2': 0.8313329121920404, 'rougeL': 0.6167122711008208, 'rougeLsum': 0.9453495566096101}
{'rouge1': 0.9023917259211377, 'rouge2': 0.7661133865057124, 'rougeL': 0.5662572721396251, 'rougeLsum': 0.9018780333403671}
{'rouge1': 0.9420610349402919, 'rouge2': 0.8039823008849558, 'rougeL': 0.5824856258292791, 'rougeLsum': 0.9396234581259468}
{'rouge1': 0.9487072560467057, 'rouge2': 0.8268669169795578, 'rougeL': 0.6088407005838198, 'rougeLsum': 0.9476910502656314}
{'rouge1': 0.9327887114741924, 'rouge2': 0.8053491827637445, 'rougeL': 0.5822502784998144, 'rougeLsum': 0.9307075127644056}
{'rouge1':

In [19]:
print(codet5p_tokenizer.batch_decode(preds[0], skip_special_tokens=True))

['', '', 'd', ' o', ' e', ' s', '  ', ' n', ' o', ' t', '  ', ' p', ' r', ' o', ' p', ' e', ' r', ' l', ' y', '  ', ' v', ' a', ' l', ' i', ' d', ' a', ' t', ' e', '  ', ' f', ' i', ' l', ' e', '  ', ' n', ' a', ' m', ' e', ' s', '  ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [20]:
print(codet5p_tokenizer.batch_decode(labels[0], skip_special_tokens=True))

['', 'd', ' o', ' e', ' s', '  ', ' n', ' o', ' t', '  ', ' p', ' r', ' o', ' p', ' e', ' r', ' l', ' y', '  ', ' v', ' a', ' l', ' i', ' d', ' a', ' t', ' e', '  ', ' f', ' i', ' l', ' e', '  ', ' n', ' a', ' m', ' e', ' s', '  ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [27]:
rouge1_values = [d['rouge1'] for d in rouge_list]
rouge2_values = [d['rouge2'] for d in rouge_list]
rougeL_values = [d['rougeL'] for d in rouge_list]
rougeLsum_values = [d['rougeLsum'] for d in rouge_list]
rouge1_values = sum(rouge1_values) / len(rougeL_values)
average_rouge2 = sum(rouge2_values) / len(rougeL_values)
average_rougeL = sum(rougeL_values) / len(rougeL_values)
average_rougeLsum_values = sum(rougeLsum_values) / len(rougeL_values)

rouge1_values, average_rouge2, average_rougeL, average_rougeLsum_values

(0.9137331860292488,
 0.7810573835453931,
 0.5750201565338785,
 0.9096206535688932)