In [5]:
# %pip install transformers datasets torch

In [14]:
from itertools import chain

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [6]:
# 1. Load the text corpus: the training split of the raw WikiText-2 configuration
#    (a small dataset, used here purely for demonstration).
dataset = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="train",
)
# Report how many rows (one per line of text) the split contains.
print(f"Number of lines in dataset: {len(dataset)}")

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Number of lines in dataset: 36718


In [7]:
# 2. Initialize a tokenizer (GPT-2's tokenizer, so the token IDs are compatible with a GPT-2 model)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 doesn't define a pad token by default, so the end-of-sequence token is
# reused as the pad token for any later padding.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# 3. Tokenize the whole dataset with `.map`, processing rows in batches
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples with the notebook's tokenizer."""
    texts = examples["text"]
    return tokenizer(texts, return_special_tokens_mask=False)


tokenized_ds = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
# The raw "text" column is gone; the rows now carry tokenizer outputs such as
# 'input_ids' and 'attention_mask'.

# Sanity check: show up to the first 20 token IDs of example 0.
# NOTE(review): in the recorded run this prints `[]`, i.e. example 0 tokenizes to
# nothing (presumably a blank line) -- index a later row to see real token IDs.
print(tokenized_ds[0]["input_ids"][:20])

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

[]


In [10]:
# 4. Slice into training sequences of fixed length
# For language-model training we concatenate the tokenized texts and then cut the
# result into equal-sized blocks of e.g. 128 or 512 tokens.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-length blocks.

    Args:
        examples: Mapping with "input_ids" and "attention_mask" entries, each a
            list of per-example lists (the batched format `.map` passes in).

    Returns:
        A dict with "input_ids" and "attention_mask", each a list of chunks that
        are exactly `block_size` items long. Trailing items that do not fill a
        whole block are dropped, so the result is empty when the batch holds
        fewer than `block_size` tokens in total.
    """
    # Flatten with chain.from_iterable (linear time). The previous
    # `sum(list_of_lists, [])` copied the growing accumulator once per row,
    # which is quadratic in the number of tokens.
    concatenated_inputs = list(chain.from_iterable(examples["input_ids"]))
    concatenated_masks = list(chain.from_iterable(examples["attention_mask"]))

    # Largest multiple of block_size that fits; the remainder is discarded.
    total_len = (len(concatenated_inputs) // block_size) * block_size

    def _split(values):
        # Stepping only up to total_len guarantees every chunk is full-length.
        return [values[start:start + block_size] for start in range(0, total_len, block_size)]

    return {
        "input_ids": _split(concatenated_inputs),
        "attention_mask": _split(concatenated_masks),
    }


# Regroup the tokenized rows into fixed-length blocks, 1000 rows per call.
# Each call to group_texts drops the tokens left over after its last full block.
lm_ds = tokenized_ds.map(
    group_texts,
    batched=True,
    batch_size=1000,
)
print(f"LM training sequences: {len(lm_ds)}")

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

LM training sequences: 18667


In [12]:
# 5. Create a DataLoader for the tokenized, grouped dataset
# A custom collate function turns a list of examples into tensors. No padding is
# done here: after grouping, every sequence already has the same length.
def collate_fn(batch):
    """Stack the examples' "input_ids" into one LongTensor and mirror it as labels.

    Args:
        batch: Iterable of examples, each a mapping with an "input_ids" list.
            All lists must have the same length (otherwise the tensor cannot
            be built; `tokenizer.pad` could be used to pad in that case).

    Returns:
        Dict with "input_ids" and "labels"; "labels" is an independent copy of
        "input_ids".
    """
    rows = []
    for example in batch:
        rows.append(example["input_ids"])
    input_ids = torch.tensor(rows, dtype=torch.long)

    # For language modeling the labels are the inputs shifted by one position, but
    # Transformers' CausalLM models usually do that shift internally when given
    # labels equal to input_ids, so an unshifted copy is passed along.
    labels = input_ids.clone()
    return {"input_ids": input_ids, "labels": labels}

# Seed the shuffling so the sequence of batches is the same every time the
# notebook is run from a fresh kernel (an unseeded shuffle differs per run).
SHUFFLE_SEED = 42
train_loader = DataLoader(
    lm_ds,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    # Dedicated generator: the shuffle order does not depend on (or disturb)
    # torch's global RNG state.
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

In [15]:
# 6. Pull a single batch from the loader to confirm the pipeline works end to end
batch = next(iter(train_loader), None)
if batch is not None:
    # Both tensors are expected to share the same (batch, sequence) shape.
    print(batch["input_ids"].shape, batch["labels"].shape)

torch.Size([8, 128]) torch.Size([8, 128])
