In [4]:
import transformers
from datasets import load_dataset, load_metric

In [10]:
data = load_dataset("csv",data_files="dataset/data.csv")
# test_data = load_dataset("csv",data_files="dataset/test_data.csv")

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1429.06it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 400.99it/s]
Generating train split: 264 examples [00:00, 4486.34 examples/s]


In [11]:
data

DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 264
    })
})

In [13]:
datasets_train_test = data["train"].train_test_split(test_size=72)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=50)

data["train"] = datasets_train_validation["train"]
data["validation"] = datasets_train_validation["test"]
data["test"] = datasets_train_test["test"]

In [14]:
data

DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 142
    })
    validation: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 50
    })
    test: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 72
    })
})

In [23]:
from transformers import AutoTokenizer

In [24]:
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 6.39MB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 849kB/s]
Downloading tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 1.48MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [25]:
prefix = "question: "
max_input_length = 512
max_target_length = 64

prompt = '\n\n'

def preprocess_data(examples):
  inputs = [ tools + prompt + prefix + question for tools, question in zip(examples["tools"],examples["question"])]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["answer"], max_length=max_target_length, 
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [26]:
tokenized_datasets = data.map(preprocess_data, batched=True)
tokenized_datasets

Map: 100%|██████████| 142/142 [00:00<00:00, 200.43 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 202.66 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 212.01 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 142
    })
    validation: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 72
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
batch_size = 8
model_name = "t5-base"
model_dir = f"model/{model_name}"
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer)