In [3]:
!pip install -q transformers datasets torch scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
import os
import json
import torch
from torch.utils.data import Dataset
from datasets import load_dataset, concatenate_datasets
from transformers import GPT2Tokenizer
import pickle

class CustomTextDataset(Dataset):
    """In-memory dataset of tokenized texts read from a folder of JSON files.

    Every file directly inside ``directory`` whose name ends in ``.json`` is
    parsed with ``json.load`` and iterated; for each item, ``item[text_key]``
    is tokenized and the result is stored as one example.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')``. It
            must return a mapping whose values support ``.squeeze(0)``.
        directory: Folder that contains the JSON files.
        max_length: Sequence length passed to the tokenizer (default 512).
        text_key: Key of the text field inside each JSON item
            (default ``'response_content'``).

    Raises:
        KeyError: If an item has no ``text_key`` entry.
    """

    def __init__(self, tokenizer, directory, max_length=512, text_key='response_content'):
        self.examples = []
        self.tokenizer = tokenizer
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # example order (and any seeded split built on it) reproducible.
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.json'):
                continue
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                data = json.load(file)
            # The file is closed before the (slow) tokenization starts.
            for item in data:
                encoded = tokenizer(
                    item[text_key],
                    truncation=True,
                    max_length=max_length,
                    padding='max_length',
                    return_tensors='pt',
                )
                # Drop the leading batch dimension of size 1 added by return_tensors='pt'.
                self.examples.append({key: val.squeeze(0) for key, val in encoded.items()})

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the tokenized example stored at position ``idx``."""
        return self.examples[idx]

# Initialize tokenizer
# Loads the tokenizer published under the 'distilgpt2' checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# Register '[PAD]' as the pad token; the tokenizer calls in this cell all use
# padding='max_length', which needs one.
# NOTE(review): presumably '[PAD]' is a new vocabulary entry, so a model used with
# this tokenizer must be resized with resize_token_embeddings(len(tokenizer)).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Function to tokenize dataset
def tokenize_function(examples, text_column):
    """Tokenize ``examples[text_column]`` with the module-level ``tokenizer``.

    Intended as the callback of ``dataset.map(..., batched=True)``: every text
    is truncated/padded to exactly 512 tokens.

    Args:
        examples: Mapping from column name to the values of that column.
        text_column: Name of the column that holds the raw text.

    Returns:
        The tokenizer output (one entry per input text for each returned key).
    """
    # No return_tensors='pt' + squeeze(0) here: in batched mode squeeze(0) does
    # nothing unless the batch holds a single row, in which case it would strip
    # the batch dimension and turn one example into a flat vector.
    return tokenizer(examples[text_column], truncation=True, max_length=512, padding='max_length')

# Function to load and tokenize one dataset split
def load_and_tokenize_dataset(name, split, text_column):
    """Load a Hugging Face dataset split and tokenize its text column.

    Args:
        name: Dataset name passed to ``load_dataset``.
        split: Split expression passed to ``load_dataset``. The whole requested
            split is loaded; pass a slice such as ``'train[:1%]'`` for a subset.
        text_column: Column that holds the raw text to tokenize.

    Returns:
        The mapped dataset containing only the columns produced by
        ``tokenize_function``.
    """
    # NOTE(review): trust_remote_code=True runs the dataset's loading script;
    # keep it only for datasets you trust.
    dataset = load_dataset(name, split=split, trust_remote_code=True)
    print(f"Loaded {name} dataset with {len(dataset)} samples")
    # Drop every original column, not just the text one. Otherwise leftover
    # fields (presumably e.g. gigaword's 'summary') survive into the output and
    # the datasets no longer share one schema.
    tokenized_dataset = dataset.map(
        lambda batch: tokenize_function(batch, text_column),
        batched=True,
        remove_columns=dataset.column_names,
    )
    print(f"Tokenized {name} dataset: {tokenized_dataset}")
    return tokenized_dataset

# Specify the directory containing the JSON files.
# The original absolute path stays the default; set CUSTOM_DATA_DIR to override it.
directory_path = os.environ.get('CUSTOM_DATA_DIR', '/workspace/slice-monorepo/thebeast/notebooks/combined')

# Load the custom dataset
custom_dataset = CustomTextDataset(tokenizer, directory_path)

# Load other datasets
datasets = []
datasets.append(load_and_tokenize_dataset('openwebtext', 'train', 'text'))
datasets.append(load_and_tokenize_dataset('bookcorpus', 'train', 'text'))
datasets.append(load_and_tokenize_dataset('gigaword', 'train', 'document'))

# Combine datasets
# concatenate_datasets() only accepts Hugging Face Dataset objects, so it cannot
# take the torch-style CustomTextDataset (nor bare 'input_ids' columns). Instead,
# make each Hugging Face dataset return dicts of tensors restricted to the model
# inputs and chain everything with ConcatDataset, so every item has the same
# shape as a CustomTextDataset item.
model_columns = ['input_ids', 'attention_mask']
all_datasets = [custom_dataset] + [d.with_format('torch', columns=model_columns) for d in datasets]
combined_dataset = torch.utils.data.ConcatDataset(all_datasets)
print(f"Combined dataset has {len(combined_dataset)} samples")

# Split the dataset into training and testing sets
train_size = int(0.8 * len(combined_dataset))
test_size = len(combined_dataset) - train_size
# A seeded generator makes the 80/20 split identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    combined_dataset, [train_size, test_size], generator=split_generator
)

# Save datasets
with open('train_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)
with open('test_dataset.pkl', 'wb') as f:
    pickle.dump(test_dataset, f)
print("Datasets saved!")


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

Loaded openwebtext dataset with 8013769 samples


Map:   0%|          | 0/8013769 [00:00<?, ? examples/s]

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pickle

# Restore the pickled train/test splits from the working directory.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
loaded_splits = []
for pickle_path in ('train_dataset.pkl', 'test_dataset.pkl'):
    with open(pickle_path, 'rb') as f:
        loaded_splits.append(pickle.load(f))
train_dataset, test_dataset = loaded_splits

# Batch both splits; only the training batches are reshuffled every epoch
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Rebuild the tokenizer with the same '[PAD]' pad token used during preprocessing
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Option to initialize the model from scratch
def initialize_model(from_pretrained=True):
    """Build the GPT-2 language model used for training.

    Args:
        from_pretrained: When True (default), load the published 'distilgpt2'
            weights. When False, create a randomly initialised model that uses
            the 'distilgpt2' configuration.

    Returns:
        A ``GPT2LMHeadModel`` whose token embeddings are resized to
        ``len(tokenizer)`` and whose config knows the tokenizer's pad token id.
    """
    if from_pretrained:
        model = GPT2LMHeadModel.from_pretrained('distilgpt2')
    else:
        # config_class() with no arguments gives the library's default GPT-2
        # configuration rather than distilgpt2's; load the checkpoint's config
        # so the scratch model has the same architecture as the pretrained one.
        config = GPT2LMHeadModel.config_class.from_pretrained('distilgpt2')
        model = GPT2LMHeadModel(config)
    model.resize_token_embeddings(len(tokenizer))  # Adjust the model's embedding size to account for new tokens
    # Record the pad id on the model so padding is recognised consistently
    # (e.g. by generate()).
    model.config.pad_token_id = tokenizer.pad_token_id
    return model

# Initialize model
model = initialize_model(from_pretrained=True)  # Change to False to train from scratch

# Set device and DataParallel
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

# Define the optimizer and scheduler
# One constant drives both the scheduler horizon and the loop below, so the
# learning-rate decay always matches the number of epochs actually run.
num_epochs = 3
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
train_losses = []
eval_losses = []
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Padding positions must not contribute to the loss: -100 is the label
        # value that the model's loss ignores.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss.mean()  # Under DataParallel the loss has one entry per GPU
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
    avg_train_loss = epoch_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

# Evaluation (run once, after the final epoch)
model.eval()
total_eval_loss = 0
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Same padding mask as in training so both losses are comparable.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss.mean()  # Aggregate the loss to a scalar value
        total_eval_loss += loss.item()
    avg_eval_loss = total_eval_loss / len(test_dataloader)
    eval_losses.append(avg_eval_loss)
    print(f"Average Evaluation Loss: {avg_eval_loss}")

# Save the model
model_path = 'distilgpt2-trained'
# DataParallel wraps the real model in .module; save the underlying model.
if isinstance(model, nn.DataParallel):
    model.module.save_pretrained(model_path)
else:
    model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print("Training complete!")

# Plotting the training and evaluation losses
plt.figure(figsize=(10, 5))
# Markers keep the single evaluation point visible (a one-point line draws nothing).
plt.plot(train_losses, label='Training Loss', marker='o')
plt.plot(eval_losses, label='Evaluation Loss', marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Evaluation Losses')
plt.legend()
plt.show()

# Simple validation
model.eval()
sample_text = "Once upon a time"
encoded_input = tokenizer(sample_text, return_tensors='pt').to(device)
# generate() lives on the underlying model, not on the DataParallel wrapper.
generation_model = model.module if isinstance(model, nn.DataParallel) else model
# Pass the attention mask and pad id explicitly so generation does not have to guess them.
output = generation_model.generate(
    encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    pad_token_id=tokenizer.pad_token_id,
    max_length=50,
)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Sample Output: {decoded_output}")


In [None]:
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import GPT2Tokenizer

# Initialize tokenizer
# Loads the tokenizer published under the 'distilgpt2' checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# tokenize_function below pads with padding='max_length', which needs a pad token.
# NOTE(review): presumably '[PAD]' is a new vocabulary entry, so the model must be
# resized with resize_token_embeddings(len(tokenizer)) before training.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add padding token

# Tokenization callback for dataset.map
def tokenize_function(examples, text_column):
    """Run the module-level ``tokenizer`` over ``examples[text_column]``.

    Each text is truncated or padded to exactly 128 tokens. The tokenizer's
    output is returned unchanged.
    """
    return tokenizer(
        examples[text_column],
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# Function to load and tokenize dataset
def load_and_tokenize_dataset(name, split, text_column):
    """Load a Hugging Face dataset split and tokenize its text column.

    Args:
        name: Dataset name passed to ``load_dataset``.
        split: Split expression passed to ``load_dataset`` (e.g. ``'train[:1%]'``).
        text_column: Column that holds the raw text to tokenize.

    Returns:
        The mapped dataset containing only the columns produced by
        ``tokenize_function``.
    """
    # NOTE(review): trust_remote_code=True runs the dataset's loading script;
    # keep it only for datasets you trust.
    dataset = load_dataset(name, split=split, trust_remote_code=True)
    print(f"Loaded {name} dataset with {len(dataset)} samples")
    # Drop every original column, not just the text one. Otherwise leftover
    # fields (presumably e.g. gigaword's 'summary') survive into the output and
    # the datasets no longer share one schema when concatenated.
    tokenized_dataset = dataset.map(
        lambda batch: tokenize_function(batch, text_column),
        batched=True,
        remove_columns=dataset.column_names,
    )
    print(f"Tokenized {name} dataset: {tokenized_dataset}")
    return tokenized_dataset

# Load datasets
datasets = []

# Uncomment the datasets you want to use
# Wikipedia
# datasets.append(load_and_tokenize_dataset('wikipedia', '20220301.en[:1%]', 'text'))
# OpenWebText
datasets.append(load_and_tokenize_dataset('openwebtext', 'train[:1%]', 'text'))
# BooksCorpus
datasets.append(load_and_tokenize_dataset('bookcorpus', 'train[:1%]', 'text'))
# English Gigaword
datasets.append(load_and_tokenize_dataset('gigaword', 'train[:1%]', 'document'))

# Combine datasets
# Return the two model inputs as torch tensors. With plain Python lists the
# default DataLoader collation does not produce (batch, seq_len) tensors, and
# any non-model column would be carried into every batch.
combined_dataset = concatenate_datasets(datasets).with_format(
    'torch', columns=['input_ids', 'attention_mask']
)
print(f"Combined dataset has {len(combined_dataset)} samples")

# Split the dataset into training and testing sets
train_size = int(0.8 * len(combined_dataset))
test_size = len(combined_dataset) - train_size
# A seeded generator makes the 80/20 split identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    combined_dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# Print dataset and dataloader information
print(f"Train dataset has {len(train_dataset)} samples")
print(f"Test dataset has {len(test_dataset)} samples")
print("DataLoader configuration:")
print(f"Train DataLoader: {len(train_dataloader)} batches")
print(f"Test DataLoader: {len(test_dataloader)} batches")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/21 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

Loaded openwebtext dataset with 80138 samples


Map:   0%|          | 0/80138 [00:00<?, ? examples/s]

Tokenized openwebtext dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 80138
})


Downloading builder script:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

In [None]:
import torch
import pickle

# Persist the datasets and their DataLoaders in a single pickle file
save_path = 'combined_dataset.pkl'

# Collect everything to store under the keys that the loading cell reads back
artifacts = {
    'combined_dataset': combined_dataset,
    'train_dataset': train_dataset,
    'test_dataset': test_dataset,
    'train_dataloader': train_dataloader,
    'test_dataloader': test_dataloader,
}

with open(save_path, 'wb') as f:
    pickle.dump(artifacts, f)

print(f"Saved combined dataset and DataLoader configuration to {save_path}")


In [None]:
import torch
import pickle

# Read back the pickle written by the saving cell.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
load_path = 'combined_dataset.pkl'

with open(load_path, 'rb') as f:
    data = pickle.load(f)

# Unpack the stored objects into the names used by the rest of the notebook
combined_dataset, train_dataset, test_dataset = (
    data['combined_dataset'],
    data['train_dataset'],
    data['test_dataset'],
)
train_dataloader, test_dataloader = data['train_dataloader'], data['test_dataloader']

# Report sizes so a truncated or stale pickle is easy to spot
print(f"Loaded combined dataset with {len(combined_dataset)} samples")
print(f"Train dataset has {len(train_dataset)} samples")
print(f"Test dataset has {len(test_dataset)} samples")
print("DataLoader configuration:")
print(f"Train DataLoader: {len(train_dataloader)} batches")
print(f"Test DataLoader: {len(test_dataloader)} batches")


In [None]:
import torch
from transformers import GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os

# Initialize model
# Local import: this cell's import list does not include the tokenizer class.
from transformers import GPT2Tokenizer

# Build the tokenizer here instead of relying on a `tokenizer` variable left
# over from another cell; otherwise this cell raises NameError when the
# notebook is resumed from the pickled datasets on a fresh kernel.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Ensure padding token is added
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to accommodate the new pad token
# Record the pad id on the model so padding is recognised consistently.
model.config.pad_token_id = tokenizer.pad_token_id

# Set device to CUDA or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Collate function that tolerates missing samples and missing field values
def custom_collate(batch):
    """Merge a list of sample dicts into one dict of stacked tensors.

    ``None`` samples are dropped; if nothing is left, ``None`` is returned.
    The keys are taken from the first remaining sample and the ``'summary'``
    key is skipped. For every other key the non-``None`` values are stacked
    along a new first dimension (values are converted with ``torch.tensor``
    first when the first of them is a list). A key whose values are all
    ``None`` maps to ``None``.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    collated_batch = {}
    for key in samples[0]:
        if key == 'summary':
            continue
        values = [sample[key] for sample in samples if sample[key] is not None]
        if not values:
            collated_batch[key] = None
            continue
        # Only the first value is inspected to decide whether conversion is needed.
        if isinstance(values[0], list):
            values = [torch.tensor(value) for value in values]
        collated_batch[key] = torch.stack(values, dim=0)
    return collated_batch

# Create DataLoader with custom collate function
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, collate_fn=custom_collate, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=2, collate_fn=custom_collate, shuffle=False)

# Define the optimizer and scheduler
# One constant drives both the scheduler horizon and the loop below. Previously
# the schedule was sized for 3 epochs while the loop ran 1, so the learning rate
# never finished decaying. Raise num_epochs to train longer.
num_epochs = 1
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# confirm the defaults you want (e.g. weight decay) before switching.
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    num_batches = 0  # batches actually trained on (empty batches are skipped)
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        if batch is None:  # Skip empty batches
            continue
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Padding positions must not contribute to the loss: -100 is the label
        # value that the model's loss ignores.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
        num_batches += 1
    # Average over the batches that contributed a loss, guarding against zero.
    print(f"Epoch {epoch+1}, Loss: {epoch_loss / max(num_batches, 1)}")

# Evaluation
model.eval()
# NOTE(review): these lists hold one token-id array per test sequence and grow
# with the size of the test set; they are not used further in this cell.
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        if batch is None:  # Skip empty batches
            continue
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone().to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Greedy token prediction at every position of every sequence.
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Save the model
model.save_pretrained('./distilgpt2-trained')
tokenizer.save_pretrained('./distilgpt2-trained')

print("Training complete!")
