In [None]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from itertools import chain

## Directory that holds the fine-tuned tokenizer files
tokenizer_path = "./wiki2tokenizer"

## Load the raw WikiText-2 corpus and remember the names of its splits
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
splits = dataset.keys()


## Settings for fine-tuning a tokenizer on this dataset:
## a new tokenizer is trained on the Wiki text, starting from the GPT-2 tokenizer.
batch_size = 1000   ## rows of text yielded per batch by batch_iterator
vocab_size = 25000  ## intended vocabulary size of the retrained tokenizer
## Half of the available CPU cores (at least one worker); os.cpu_count() may
## return None, in which case a single worker is used.
num_proc = max(1, (os.cpu_count() or 2) // 2)

def batch_iterator(splits, corpus=None, chunk_size=None):
    """Yield the "text" column of the given splits in consecutive batches.

    Args:
        splits: Iterable of split names to read, in order.
        corpus: Mapping from split name to a dataset that supports ``len()``
            and slice indexing returning a mapping with a ``"text"`` entry.
            Defaults to the module-level ``dataset``.
        chunk_size: Maximum number of rows per yielded batch. Defaults to the
            module-level ``batch_size``.

    Yields:
        The ``"text"`` values of each slice; the last batch of a split may be
        shorter than ``chunk_size``.

    Raises:
        ValueError: If the batch size is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # Fall back to the notebook-level settings so existing calls keep working.
    if corpus is None:
        corpus = dataset
    if chunk_size is None:
        chunk_size = batch_size
    if chunk_size <= 0:
        raise ValueError(f"batch size must be positive, got {chunk_size}")

    for split in splits:
        rows = corpus[split]
        for start in range(0, len(rows), chunk_size):
            yield rows[start : start + chunk_size]["text"]

def tune_tokenizer(target_vocab_size=None, base_checkpoint="gpt2"):
    """Train a new tokenizer on the dataset, starting from a pretrained one.

    Args:
        target_vocab_size: Vocabulary size requested for the new tokenizer.
            Defaults to the module-level ``vocab_size`` setting, so changing
            that setting actually takes effect.
        base_checkpoint: Name or path of the pretrained tokenizer to start
            from. Defaults to ``"gpt2"``.

    Returns:
        The tokenizer returned by ``train_new_from_iterator``, trained on the
        batches produced by ``batch_iterator(splits)``.

    Raises:
        ValueError: If the loaded base tokenizer is not a "fast" tokenizer.
    """
    if target_vocab_size is None:
        target_vocab_size = vocab_size

    base_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    # train_new_from_iterator is only provided by fast tokenizers, so fail
    # with a clear message instead of printing the flag and crashing later.
    if not base_tokenizer.is_fast:
        raise ValueError(
            f"{base_checkpoint!r} did not load as a fast tokenizer; "
            "train_new_from_iterator requires one"
        )
    return base_tokenizer.train_new_from_iterator(
        batch_iterator(splits), vocab_size=target_vocab_size
    )


def tokenize(example):
    """Run the module-level ``tokenizer`` on ``example`` and return its output.

    Args:
        example: The text (or batch of texts) to tokenize; it is forwarded to
            ``tokenizer`` unchanged.

    Returns:
        Whatever ``tokenizer(example)`` returns.
    """
    return tokenizer(example)


In [None]:
## Reuse the fine-tuned tokenizer if it was already saved; otherwise train it
## once with tune_tokenizer() and save it so later runs can load it from disk.
if os.path.isdir(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    tokenizer = tune_tokenizer()
    tokenizer.save_pretrained(tokenizer_path)

## Tokenize every split in batches; the raw "text" column is dropped so only
## the tokenizer outputs remain.
tokenized_dataset = dataset.map(lambda example: tokenize(example['text']),
                        batched=True,
                        remove_columns=['text'],
                        desc="Tokenizing the splits",
                        num_proc=num_proc,)

## Show the resulting schema and row count of each split.
## NOTE: 'input_ids' presumably holds one list of token ids per row; flatten it
## (e.g. with itertools.chain.from_iterable) when a single token stream is needed.
for split, dset in tokenized_dataset.items():
    print(split)
    print(dset)

In [None]:
## Flatten the per-row token ids of each split into one list per split, so the
## lengths printed below are total token counts.
tokens = {
    split: list(chain.from_iterable(tokenized_dataset[split]['input_ids']))
    for split in tokenized_dataset
}
print(len(tokens['train']), len(tokens['validation']), len(tokens['test']))

In [None]:
## Preview the token ids of the first few training rows instead of dumping the
## whole column into the cell output.
tokenized_dataset['train'][:5]['input_ids']

In [1]:
import pickle
## Load the previously tokenized datasets from a pickle file.
## NOTE(review): pickle.load can execute arbitrary code; only open files that
## were produced by a trusted run of this project.
## NOTE(review): nothing in the visible cells writes this file — confirm where
## it is created.
dataset_path = "./wiki2tokens.bin"
with open(dataset_path, "rb") as f:
    loaded_objects = pickle.load(f)

# Unpack the tuple of objects.
# NOTE: this rebinds `vocab_size`, replacing any value set earlier in the same
# kernel session.
vocab_size, tokenized_text, tokens_dataset = loaded_objects
# Display the loaded token data (the saved output shows one tensor per split).
tokenized_text

{'test': tensor([  301,  2480, 16377,  ...,  6961,   272,   315]),
 'train': tensor([  301,  9823, 10724,  ..., 11340,   272,   315]),
 'validation': tensor([  301, 14261, 13975,  ...,   301,   301,   315])}

In [1]:
import torch

## Ask PyTorch how many CUDA devices it can see and report the count.
num_gpus = torch.cuda.device_count()
print("Number of GPUs available:", num_gpus)

Number of GPUs available: 0
