In [None]:
!pip install --upgrade --quiet pip

In [None]:
!pip install --upgrade --quiet datasets sentencepiece tqdm huggingface-hub

In [1]:
from datasets import Dataset, DatasetDict, load_dataset

In [2]:
# Notebook configuration.
# Directory passed as `data_dir` to load_dataset; relative to the notebook's working directory.
DATA_DIR = "../data"
# JSON-lines file loaded as the "train" split.
TRAIN_FILE = "ELI5_train.jsonl"
# JSON-lines file loaded as the "test" split.
TEST_FILE = "ELI5_val.jsonl"
# Columns dropped from both splits immediately after loading.
COLUMN_REMOVE = ["question_id"]
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
BATCH_SIZE = 1000

In [3]:
# Load both JSON-lines files with the generic "json" builder. The builder always
# exposes a single split called "train", so that split is pulled out and stored
# under the split name we actually want. The columns listed in COLUMN_REMOVE are
# dropped from each split right after loading.
data = DatasetDict(
    {
        split: load_dataset(
            "json", name=config_name, data_dir=DATA_DIR, data_files=file_name
        )["train"].remove_columns(COLUMN_REMOVE)
        for split, config_name, file_name in (
            ("train", "eli5_train", TRAIN_FILE),
            ("test", "eli5_test", TEST_FILE),
        )
    }
)

Downloading and preparing dataset json/eli5_train to C:/Users/khann/.cache/huggingface/datasets/json/eli5_train-1a601a67e1b6c262/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/khann/.cache/huggingface/datasets/json/eli5_train-1a601a67e1b6c262/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset json/eli5_test to C:/Users/khann/.cache/huggingface/datasets/json/eli5_test-25eb913cb81c18cc/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/khann/.cache/huggingface/datasets/json/eli5_test-25eb913cb81c18cc/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Display the loaded splits to check their column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 272634
    })
    test: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 1507
    })
})

In [5]:
def remove_score(batch):
    """Keep only element 0 of every context entry in a batch.

    Each row of ``batch["ctxs"]`` is a list of indexable entries; every entry
    is replaced by its first element. Judging by the function name, entries
    are presumably (text, score)-like pairs — TODO confirm against the data.

    Args:
        batch: Mapping with a "ctxs" key holding one list of entries per row.

    Returns:
        dict: ``{"ctxs": ...}`` with one list per row containing ``entry[0]``
        for each entry of that row, in the original order.
    """
    stripped = [[entry[0] for entry in row] for row in batch["ctxs"]]
    return {"ctxs": stripped}

In [6]:
def one_pair_qc(batch):
    """Reduce every row to a single answer and a single context.

    For each row index of ``batch["ctxs"]``, the first element of that row's
    "answers" list and the first element of its "ctxs" list are kept.

    Args:
        batch: Mapping with "answers" and "ctxs" keys, each holding one
            indexable sequence per row.

    Returns:
        dict: ``{"answer": [...], "context": [...]}`` with one value per row.

    Raises:
        IndexError: If a row has an empty "answers" or "ctxs" sequence, or if
            "answers" has fewer rows than "ctxs".
    """
    # The number of rows is taken from the "ctxs" column, as in the index loop.
    row_ids = range(len(batch["ctxs"]))
    return {
        "answer": [batch["answers"][i][0] for i in row_ids],
        "context": [batch["ctxs"][i][0] for i in row_ids],
    }

In [14]:
def train_test_split(data, train_size=None, val_size=None):
    """Carve a validation split out of ``data["train"]``.

    The first rows of ``data["train"]`` become the new "train" split and the
    rows immediately after them become "val". ``data["test"]`` is copied
    through unchanged. Rows are taken in their existing order; no shuffling
    is performed.

    Args:
        data: Mapping with "train" and "test" datasets. The "train" dataset
            must expose ``num_rows`` and support slice indexing.
        train_size: Fraction (0..1) of the original train rows to keep as
            "train". If omitted, it is everything not used for "val".
        val_size: Fraction (0..1) of the original train rows to use as "val".
            If omitted, it is everything not used for "train".

    Returns:
        DatasetDict with "train", "val" and "test" splits.

    Raises:
        ValueError: If neither size is given, or a size lies outside [0, 1].
    """
    if train_size is None and val_size is None:
        raise ValueError("Must assign at least train_size or val_size!")
    for label, fraction in (("train_size", train_size), ("val_size", val_size)):
        if fraction is not None and not 0 <= fraction <= 1:
            raise ValueError(
                f"{label} must be a fraction between 0 and 1, got {fraction!r}"
            )

    size = data["train"].num_rows
    # Derive the unspecified split as a row count (not as `1 - fraction`) so
    # float rounding can never drop a row between the two splits.
    if train_size is None:
        n_val = int(size * val_size)
        n_train = size - n_val
    elif val_size is None:
        n_train = int(size * train_size)
        n_val = size - n_train
    else:
        n_train = int(size * train_size)
        n_val = int(size * val_size)

    train = Dataset.from_dict(data["train"][:n_train])
    val = Dataset.from_dict(data["train"][n_train : n_train + n_val])
    test = Dataset.from_dict(data["test"][:])
    return DatasetDict(
        {
            "train": train,
            "val": val,
            "test": test,
        }
    )

In [8]:
# Apply remove_score to the test split only; the original "ctxs" column is dropped
# and replaced by the function's output.
# NOTE(review): presumably only the test file stores contexts as (text, score)-like
# entries — confirm against the raw data.
# NOTE: not idempotent — re-running this cell applies remove_score to the already
# stripped column and takes element 0 of each remaining value.
data["test"] = data["test"].map(
    remove_score,
    batched=True,
    remove_columns=["ctxs"],
)

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

In [9]:
# Reduce every split to one answer and one context per question: one_pair_qc emits
# the new "answer" and "context" columns, and the original list columns are dropped.
# NOTE: rebinds `data`; re-running this cell fails because "answers" and "ctxs"
# no longer exist after the first run.
data = data.map(
    one_pair_qc,
    batched=True,
    remove_columns=["answers", "ctxs"],
)

Map:   0%|          | 0/272634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

In [17]:
# Keep the first 80% of the train rows as "train"; the remainder becomes "val".
# NOTE: rebinds `data`; re-running this cell splits the already reduced train split again.
data = train_test_split(data, train_size=0.8)

In [19]:
# Upload every split of the processed dataset to the Hub repository "rusano/ELI5_custom".
# NOTE(review): no login step is visible in this notebook — presumably Hugging Face
# credentials are already configured in the environment; confirm before re-running.
data.push_to_hub("rusano/ELI5_custom", max_shard_size="1GB")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/197 [00:00<?, ?ba/s]

Pushing split val to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]