In [None]:
!pip install -q transformers datasets torch scikit-learn

In [1]:
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import GPT2Tokenizer

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add padding token

# Function to tokenize dataset
def tokenize_function(examples, text_column):
    tokens = tokenizer(examples[text_column], padding='max_length', truncation=True, max_length=128)
    return tokens

# Function to load and tokenize dataset
def load_and_tokenize_dataset(name, split, text_column):
    dataset = load_dataset(name, split=split, trust_remote_code=True)
    print(f"Loaded {name} dataset with {len(dataset)} samples")
    tokenized_dataset = dataset.map(lambda x: tokenize_function(x, text_column), batched=True, remove_columns=[text_column])
    print(f"Tokenized {name} dataset: {tokenized_dataset}")
    return tokenized_dataset

# Load datasets
datasets = []

# Uncomment the datasets you want to use
# Wikipedia
# datasets.append(load_and_tokenize_dataset('wikipedia', '20220301.en[:1%]', 'text'))
# OpenWebText
datasets.append(load_and_tokenize_dataset('openwebtext', 'train[:1%]', 'text'))
# BooksCorpus
datasets.append(load_and_tokenize_dataset('bookcorpus', 'train[:1%]', 'text'))
# English Gigaword
datasets.append(load_and_tokenize_dataset('gigaword', 'train[:1%]', 'document'))

# Combine datasets
combined_dataset = concatenate_datasets(datasets)
print(f"Combined dataset has {len(combined_dataset)} samples")

# Split the dataset into training and testing sets
train_size = int(0.8 * len(combined_dataset))
test_size = len(combined_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(combined_dataset, [train_size, test_size])


Loaded openwebtext dataset with 80138 samples
Tokenized openwebtext dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 80138
})
Loaded bookcorpus dataset with 740042 samples
Tokenized bookcorpus dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 740042
})
Loaded gigaword dataset with 38040 samples
Tokenized gigaword dataset: Dataset({
    features: ['summary', 'input_ids', 'attention_mask'],
    num_rows: 38040
})
Combined dataset has 858220 samples


In [None]:
import torch
from transformers import GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os

# Initialize model
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Ensure padding token is added
model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to accommodate the new pad token

# Set device to CUDA or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Improved custom collate function to handle None entries and ensure all elements are tensors
def custom_collate(batch):
    batch = [b for b in batch if b is not None]
    if len(batch) == 0:
        return None
    collated_batch = {}
    for key in batch[0]:
        if key == 'summary':
            continue
        filtered = [b[key] for b in batch if b[key] is not None]
        if len(filtered) > 0:
            if isinstance(filtered[0], list):
                filtered = [torch.tensor(f) for f in filtered]
            collated_batch[key] = torch.stack(filtered, dim=0)
        else:
            collated_batch[key] = None
    return collated_batch

# Create DataLoader with custom collate function
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=200, collate_fn=custom_collate, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=200, collate_fn=custom_collate, shuffle=False)

# Define the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(1):
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        if batch is None:
            continue
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {epoch_loss / len(train_dataloader)}")

# Evaluation
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        if batch is None:  # Skip empty batches
            continue
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone().to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Save the model
model.save_pretrained('./distilgpt2-trained')
tokenizer.save_pretrained('./distilgpt2-trained')

print("Training complete!")


Training Epoch 1:  14%|█▎        | 466/3433 [05:43<37:03,  1.33it/s]