In [None]:
!pip install --upgrade --quiet pip

In [None]:
!pip install --upgrade --quiet transformers datasets sentencepiece tqdm huggingface-hub

In [3]:
from datasets import load_dataset
from torch import Tensor
from transformers import T5TokenizerFast

In [4]:
data = load_dataset("rusano/ELI5_custom")

Found cached dataset parquet (C:/Users/khann/.cache/huggingface/datasets/rusano___parquet/rusano--ELI5_custom-53760243fd2b2ddc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
print(f"Features   : {', '.join(data['test'].features)}")
print("Train set  :", data["train"].num_rows)
print("Valid set  :", data["val"].num_rows)
print("Test set   :", data["test"].num_rows)

Features   : question, answer, context
Train set  : 196296
Valid set  : 49074
Test set   : 1507


In [6]:
CHECKPOINT = "t5-base"
TOKENIZER = T5TokenizerFast.from_pretrained(CHECKPOINT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [21]:
def encode(example):
    input = f"{example['question']}</s>{example['context']}</s>"
    target = f"{example['answer']}</s>"

    input_encoding = TOKENIZER.encode_plus(
        input,
        padding="longest",
        add_special_tokens=True,
        return_tensors="pt",
    )

    target_encoding = TOKENIZER.encode_plus(
        target,
        padding="longest",
        add_special_tokens=True,
        return_tensors="pt",
    )

    return {
        "input_ids": Tensor(input_encoding["input_ids"].squeeze()).size()[0],
        "labels": Tensor(target_encoding["input_ids"].squeeze()).size()[0],
    }

In [22]:
data_encode = data.map(encode, remove_columns=data["train"].column_names)

Map:   0%|          | 0/196296 [00:00<?, ? examples/s]

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

Map:   0%|          | 0/49074 [00:00<?, ? examples/s]

In [43]:
import numpy as np

input_len = np.array([])
target_len = np.array([])

for set in data_encode:
    input_len = np.append(input_len, data_encode[set]["input_ids"])
    target_len = np.append(target_len, data_encode[set]["labels"])

print(np.max(input_len))
print(np.mean(input_len))
print(np.max(target_len))
print(np.mean(target_len))

501.0
156.5317020216545
9257.0
165.96153550148455


In [44]:
INPUT_LEN = 512
TARGET_LEN = 256

In [45]:
def batch_tokenize(batch):
    inputs = []
    target = []
    for ids in range(len(batch["question"])):
        inputs.append(f"{batch['question'][ids]}</s>{batch['context'][ids]}</s>")
        target.append(f"{batch['answer'][ids]}</s>")

    input_encoding = TOKENIZER.batch_encode_plus(
        inputs,
        max_length=INPUT_LEN,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    target_encoding = TOKENIZER.batch_encode_plus(
        target,
        max_length=TARGET_LEN,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    return {
        "input_ids": Tensor(input_encoding["input_ids"].squeeze()),
        "attention_mask": Tensor(input_encoding["attention_mask"].squeeze()),
        "labels": Tensor(target_encoding["input_ids"].squeeze()),
        "decoder_attention_mask": Tensor(target_encoding["attention_mask"].squeeze()),
    }

In [46]:
data_encode = data.map(
    batch_tokenize, batched=True, remove_columns=data["train"].column_names
)

Map:   0%|          | 0/196296 [00:00<?, ? examples/s]

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

Map:   0%|          | 0/49074 [00:00<?, ? examples/s]

In [49]:
data_encode

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 196296
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 1507
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 49074
    })
})

In [50]:
data_encode.push_to_hub("rusano/ELI5_custom_encoded", max_shard_size="1GB")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/99 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/99 [00:00<?, ?ba/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing split val to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]