In [13]:
import os
import warnings
warnings.filterwarnings("ignore")
from IPython.display import Markdown, display

from datasets import Dataset
from datasets import load_dataset

# Transformers module
from transformers import AutoTokenizer

# 1. Load the Dataset from the Data Preparation document

In [None]:
# Read the preprocessed parquet file as a single Hugging Face `Dataset` ("train" split).
dataset = load_dataset(
    "parquet",
    data_files="./data/preprocessed_dataset.parquet",
    split="train",
)

print(f"Data type of the dataset object: {type(dataset)}")

# Keep only shard 0 out of 10 so the remaining cells work on a fraction of the rows.
# Sharding is also how datasets that are terabytes in size can be split up and
# loaded/processed in parallel.
dataset = dataset.shard(num_shards=10, index=0)

# Preview the first five rows.
dataset[:5]

Data type of the dataset object: <class 'datasets.arrow_dataset.Dataset'>


{'text': ['# Comparing the magnitudes of expressions of surds\n\nI recently tackled some questions on maths-challenge / maths-aptitude papers where the task was to order various expressions made up of surds (without a calculator, obviously).\n\nI found myself wondering whether I was relying too much on knowing the numerical value of some common surds, when a more robust method was available (and would work in more difficult cases).\n\nFor example, one question asked which is the largest of:\n\n(a) $\\sqrt{10}$\n(b) $\\sqrt2+\\sqrt3$\n(c) $5-\\sqrt3$\n\nIn this case, I relied on my knowledge that $\\sqrt{10} \\approx 3.16$ and $\\sqrt2\\approx 1.41$ and $\\sqrt3 \\approx 1.73$ to find (a) $\\approx 3.16$, (b) $\\approx ~3.14$ and (c) $\\approx ~3.27$ so that the required answer is (c).\n\nBut this seemed inelegant: I felt there might be some way to manipulate the surd expressions to make the ordering more explicit. I can\'t see what that might be, however (squaring all the expressions d

# 2. Tokenization

In [43]:
# We use BERT's WordPiece tokenizer, a sub-word algorithm that resembles
# BPE (Byte Pair Encoding).
model_path_or_name = "bert-base-uncased"

# "[BOS]" and "[EOS]" are registered as the tokenizer's BOS/EOS special tokens.
# They are added to the vocabulary as new entries, so any model paired with this
# tokenizer needs embeddings for them that are trained or fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name,
    bos_token="[BOS]",  # Define the BOS token string
    eos_token="[EOS]",  # Define the EOS token string
    use_fast=True,
)

# Show which tokenization model backs the fast tokenizer; `backend_tokenizer` is
# the public accessor for the underlying `tokenizers` object.
print(tokenizer.backend_tokenizer.model)
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<tokenizers.models.WordPiece object at 0x134986210>
BOS token ID: 30522
EOS token ID: 30523


### 2.1. Tokenization Helper function

In [46]:
def tokenize_text(sample):
    """Tokenize one dataset row and attach its token ids and token count.

    The row's "text" value is split into tokens by the notebook-level
    `tokenizer`, the tokens are converted to vocabulary ids, and the id
    sequence is wrapped with the tokenizer's BOS and EOS ids.

    Args:
        sample: Mapping holding a "text" entry. It is modified in place.

    Returns:
        The same `sample` object, extended with:
          - "input_ids": [BOS id] + token ids + [EOS id]
          - "num_tokens": length of "input_ids" (BOS and EOS are counted)
    """
    # Text -> sub-word tokens -> vocabulary ids.
    pieces = tokenizer.tokenize(sample["text"])
    body_ids = tokenizer.convert_tokens_to_ids(pieces)

    # Frame the sequence with the begin/end-of-sequence markers.
    wrapped_ids = [tokenizer.bos_token_id, *body_ids, tokenizer.eos_token_id]

    sample["input_ids"] = wrapped_ids
    # Stored per row so corpus-level totals can be summed later.
    sample["num_tokens"] = len(wrapped_ids)
    return sample

# Run the helper over every row; the result gains "input_ids" and "num_tokens" columns.
dataset = dataset.map(function=tokenize_text)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [54]:
# Corpus-level token count: sum of the per-row counts stored during tokenization.
total_tokens = sum(dataset["num_tokens"])
print(f"Total number of tokens : {total_tokens}")

print("Sample Text and their input ids:")
sample = dataset[-1]  # last row of the tokenized dataset

# Show only a short prefix of the text and of its ids to keep the output readable.
preview_len = 30
print("text :", sample["text"][:preview_len])
print("Input Ids: ", sample["input_ids"][:preview_len])

Total number of tokens : 191939
Sample Text and their input ids:
text : # Find the value of point A an
Input Ids:  [30522, 1001, 2424, 1996, 3643, 1997, 2391, 1037, 1998, 1038, 1015, 1012, 13292, 2656, 1010, 2325, 1001, 1001, 1001, 18789, 14634, 1035, 2442, 2050, 7592, 8519, 1012, 1045, 2572, 22476]
