In [1]:
from typing import List, Dict, Any

from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict
from transformers import AutoTokenizer
from multiprocessing import cpu_count
import random
import numpy as np

# Single source of truth for the seed, so both RNGs are always seeded with the
# same value and it can be changed in one place.
SEED = 93

random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Load the pretrained tokenizer for the "roberta-base" checkpoint. Note that,
# despite the variable name, this is the RoBERTa tokenizer rather than BERT's.
bert_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Display the tokenizer's separator token (recorded output: '</s>').
bert_tokenizer.sep_token

'</s>'

In [3]:
# Sanity check: tokenize a batch of two strings. The recorded output shows one
# unpadded id list per string (lengths 6 and 8), each starting with id 0 and
# ending with id 2, plus a matching all-ones attention mask.
bert_tokenizer(["The quick brown fox", "jumped over the lazy dog"])

{'input_ids': [[0, 133, 2119, 6219, 23602, 2], [0, 267, 25844, 81, 5, 22414, 2335, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [4]:
# Load the "mnli" configuration of the "glue" dataset; the bare `ds` expression
# displays the available splits and their columns.
ds = load_dataset("glue", "mnli")
ds

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [5]:
# Merge the matched and mismatched validation splits into a single "eval" split
# and drop the two originals. The guard keeps the cell safe to re-run: once
# "eval" exists the source splits have been deleted, and indexing them again
# would raise KeyError.
if "eval" not in ds:
    ds["eval"] = concatenate_datasets([ds["validation_matched"], ds["validation_mismatched"]])
    del ds["validation_matched"]
    del ds["validation_mismatched"]
ds

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 19647
    })
})

In [6]:
# Peek at the premise text of the first training example.
ds["train"][0]["premise"]

'Conceptually cream skimming has two basic dimensions - product and geography.'

In [7]:
# Number of training examples. `num_rows` reads the row count from the dataset
# itself instead of pulling the entire "premise" column just to take its length.
ds["train"].num_rows

392702

In [8]:
def create_sentence_pairs(batch: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
    """Tokenize the premise/hypothesis pairs of a batch into model inputs.

    Each pair is rendered as one string,
    ``"premise: <premise>\\nhypothesis: <hypothesis>"``, and the whole batch is
    passed to ``bert_tokenizer`` in a single call instead of one call per
    example.

    Args:
        batch: Column-oriented batch holding equally long ``"premise"`` and
            ``"hypothesis"`` lists (the form ``map(..., batched=True)`` passes).

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"``, each containing
        one list per input example, in input order.
    """
    texts = [
        f"premise: {premise}\nhypothesis: {hypothesis}"
        for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])
    ]

    # Nothing to tokenize: return the same empty columns the per-example loop
    # would have produced, without calling the tokenizer at all.
    if not texts:
        return {"input_ids": [], "attention_mask": []}

    # One batched call; no padding or truncation arguments are passed, so each
    # sequence keeps its own length.
    # NOTE(review): pairs longer than the model's maximum length are not
    # truncated here - confirm that downstream code handles them.
    encoded = bert_tokenizer(texts)

    return {
        "input_ids": list(encoded["input_ids"]),
        "attention_mask": list(encoded["attention_mask"]),
    }


# Tokenize every split in parallel. One core is left free, but never fewer than
# one worker is requested: `cpu_count() - 1` evaluates to 0 on a single-core
# machine, which is not a valid `num_proc`.
ds = ds.map(create_sentence_pairs, batched=True, num_proc=max(1, cpu_count() - 1))
# The raw text columns are no longer needed once the token ids exist.
ds = ds.remove_columns(["premise", "hypothesis"])
ds

Map (num_proc=11):   0%|          | 0/392702 [00:00<?, ? examples/s]

Map (num_proc=11):   0%|          | 0/9796 [00:00<?, ? examples/s]

Map (num_proc=11):   0%|          | 0/9847 [00:00<?, ? examples/s]

Map (num_proc=11):   0%|          | 0/19647 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 19647
    })
})

In [9]:
# Decode the first training example's token ids back to text to check that the
# "premise: ... hypothesis: ..." template was tokenized as intended.
bert_tokenizer.decode(ds["train"][0]["input_ids"])

2023-12-30 19:54:54.531597: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-30 19:54:54.531630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-30 19:54:54.532483: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-30 19:54:54.537248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'<s>premise: Conceptually cream skimming has two basic dimensions - product and geography. hypothesis: Product and geography are what make cream skimming work. </s>'

In [10]:
# Persist all processed splits under ./data/mnli_processed (a path relative to
# the current working directory).
ds.save_to_disk("./data/mnli_processed")

Saving the dataset (0/1 shards):   0%|          | 0/392702 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9796 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9847 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19647 [00:00<?, ? examples/s]