In [1]:
# from datasets import load_dataset

# ds = load_dataset("csebuetnlp/xlsum", name="english")
# ds

from datasets import load_from_disk

def load_dataset(task):
    task_to_path = {
        "vulnerability_type": "./aspect_bigvul/dataset_vulnerability_type",
        "root_cause": "./aspect_bigvul/dataset_root_cause",
        "attack_vector": "./aspect_bigvul/dataset_attack_vector",
        "impact": "./aspect_bigvul/dataset_impact",
    }
    return load_from_disk(task_to_path[task])

# Usage:
ds = load_dataset("vulnerability_type")


In [2]:
ds["train"][0]

{'CVE ID': 'CVE-2018-16066',
 'explain': 'out-of-bounds read',
 'func_before': ' Node::InsertionNotificationRequest SVGStyleElement::InsertedInto(  ContainerNode* insertion_point) {  SVGElement::InsertedInto(insertion_point);  return kInsertionShouldCallDidNotifySubtreeInsertions; } '}

In [3]:
ds["test"][0]

{'CVE ID': 'CVE-2017-7889',
 'explain': 'memory corruption',
 'func_before': "static ssize_t read_mem(struct file *file, char __user *buf,    size_t count, loff_t *ppos) {  phys_addr_t p = *ppos;  ssize_t read, sz;  void *ptr;   if (p != *ppos)   return 0;   if (!valid_phys_addr_range(p, count))   return -EFAULT;  read = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED  /* we don't have page 0 mapped on sparc and m68k.. */  if (p < PAGE_SIZE) {   sz = size_inside_page(p, count);   if (sz > 0) {    if (clear_user(buf, sz))     return -EFAULT;    buf += sz;    p += sz;    count -= sz;    read += sz;   }  } #endif  while (count > 0) {  unsigned long remaining;  sz = size_inside_page(p, count);  if (!range_is_allowed(p >> PAGE_SHIFT, count))  return -EPERM;  /*    * On ia64 if a page has been mapped somewhere as uncached, then    * it must also be accessed uncached by the kernel or data    * corruption may occur.    */   ptr = xlate_dev_mem_ptr(p);   if (!ptr)    return -EFAULT;  remaining = copy_

In [4]:
from transformers import AutoTokenizer
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")



In [5]:
def tokenize_sample_data(data):
  input_feature = t5_tokenizer(data["func_before"], truncation=True, max_length=1000)
  label = t5_tokenizer(data["explain"], truncation=True, max_length=30)
  return {
    "input_ids": input_feature["input_ids"],
    "attention_mask": input_feature["attention_mask"],
    "labels": label["input_ids"],
  }

tokenized_ds = ds.map(
  tokenize_sample_data,
  remove_columns=["CVE ID", "explain", "func_before"],
  batched=True,
  batch_size=128)

tokenized_ds

Map:   0%|          | 0/3870 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/1076 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3870
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 431
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1076
    })
})

In [6]:
import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# see https://huggingface.co/docs/transformers/main_classes/configuration
mt5_config = AutoConfig.from_pretrained(
  "google/mt5-small",
  max_length=128,
  length_penalty=0.6,
  no_repeat_ngram_size=2,
  num_beams=15,
)
model = (AutoModelForSeq2SeqLM
         .from_pretrained("google/mt5-small", config=mt5_config)
         .to(device))

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  t5_tokenizer,
  model=model,
  return_tensors="pt")

In [8]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  encoded_arg = t5_tokenizer(arg)
  return t5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  preds, labels = eval_arg
  # Replace -100
  labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)
  # Convert id tokens to text
  text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)
  # Insert a line break (\n) in each sentence for ROUGE scoring
  # (Note : Please change this code, when you perform on other languages except for Japanese)
  text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
  text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
  sent_tokenizer_jp = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
  text_preds = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(p))) for p in text_preds]
  text_labels = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(l))) for l in text_labels]
  # compute ROUGE score with custom tokenization
  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )

In [9]:
from torch.utils.data import DataLoader

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=3)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break

metrics_func([preds, labels])

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'rouge1': 0.4222222222222222,
 'rouge2': 0.1858974358974359,
 'rougeL': 0.4222222222222222,
 'rougeLsum': 0.4138888888888889}

In [30]:
print(t5_tokenizer.batch_decode(preds, skip_special_tokens=True))

['<extra_id_0> -', '<extra_id_0>) return;', '<extra_id_0>? goto error; }']


In [27]:
t5_tokenizer.convert_ids_to_tokens(labels[1])

['▁out', '-', 'of', '-', 'bounds', '▁memory', '▁access', '</s>']

In [28]:
t5_tokenizer.convert_ids_to_tokens(preds[1])

['<pad>',
 '▁<extra_id_0>',
 ')',
 '▁return',
 ';',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>']