In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F


NUM_PROC = 24  # worker-process count passed as num_proc to the datasets calls

# Load the first 5% of the English Wikipedia (2024-04-01 dump) training split
dataset = load_dataset(
    "wikipedia",
    language="en",
    date="20240401",
    split='train[:5%]',
    trust_remote_code=True,
    num_proc=NUM_PROC,
)

# Path of the tokenizer file handed to Tokenizer below; point it at your own copy
tokenizer_path = 'cl100k_base.tiktoken'

# Build the tokenizer from the project-local tokenizer.py module
from tokenizer import Tokenizer
tokenizer = Tokenizer(tokenizer_path)

def tokenize_function(examples):
    """Encode the ``'text'`` entries of a batch into token-id sequences.

    Args:
        examples: Batch mapping whose ``'text'`` entry is an iterable of
            strings.

    Returns:
        dict: ``{'input_ids': [...]}`` with one encoded sequence per input
        text, each produced by ``tokenizer.encode(text, bos=True, eos=True)``.
    """
    input_ids = []
    # tokenizer.encode takes a single string, so walk the batch one text at a time
    for text in examples['text']:
        input_ids.append(tokenizer.encode(text, bos=True, eos=True))
    return {'input_ids': input_ids}

# Tokenize the whole dataset batch-wise, spread over NUM_PROC worker processes
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_PROC,
)

# Return examples as torch tensors, exposing only the 'input_ids' column
tokenized_datasets.set_format('torch', columns=['input_ids'])

# Maximum sequence length
max_seq_len = 2048

def collate_batch(batch, max_len=None, pad_id=None):
    """Collate examples into one fixed-width ``(len(batch), max_len)`` tensor.

    Each example's ``'input_ids'`` is truncated to ``max_len`` tokens, or
    right-padded with ``pad_id`` up to ``max_len``, so every batch has the
    same width regardless of its longest sequence.

    Args:
        batch: Sequence of examples, each a mapping whose ``'input_ids'`` is a
            1-D tensor (or anything ``torch.as_tensor`` accepts) of token ids.
        max_len: Target sequence length. Defaults to the module-level
            ``max_seq_len`` so the notebook has a single place to tune it.
        pad_id: Fill value for padding. Defaults to ``tokenizer.pad_id``.

    Returns:
        dict: ``{'input_ids': LongTensor}`` of shape ``(len(batch), max_len)``.
        An empty ``batch`` yields a ``(0, max_len)`` tensor.
    """
    # Resolve defaults lazily so the notebook-level settings are honoured
    # instead of being shadowed by a second hard-coded constant.
    if max_len is None:
        max_len = max_seq_len
    if pad_id is None:
        pad_id = tokenizer.pad_id

    # Allocate the output once, pre-filled with padding, then copy each
    # (possibly truncated) sequence into its row.
    padded_input_ids = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
    for row, item in enumerate(batch):
        ids = torch.as_tensor(item['input_ids'], dtype=torch.long)[:max_len]
        padded_input_ids[row, :ids.numel()] = ids
    return {'input_ids': padded_input_ids}


# Shuffled training batches of 32 examples, assembled by collate_batch
train_dataloader = DataLoader(
    tokenized_datasets,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)



In [2]:
import torch
import torch.distributed as dist
from fairscale.nn.model_parallel import initialize_model_parallel
from model import Transformer, ModelArgs

# Set up distributed environment
def setup_distributed(world_size=1, rank=0, model_parallel_size=1, backend=None):
    """Initialise torch.distributed and fairscale model parallelism once.

    Safe to call repeatedly (e.g. when the cell is re-run): each of the two
    initialisation steps is skipped if it has already been performed.

    Args:
        world_size: Total number of processes in the job.
        rank: Rank of the current process.
        model_parallel_size: Size passed to ``initialize_model_parallel``.
        backend: torch.distributed backend name. When ``None``, 'nccl' is
            used if CUDA is available and 'gloo' otherwise, so the notebook
            also runs on CPU-only machines.
    """
    # Imported locally so this block does not depend on edits to the cell's
    # import lines.
    from fairscale.nn.model_parallel.initialize import model_parallel_is_initialized

    if backend is None:
        backend = 'nccl' if torch.cuda.is_available() else 'gloo'

    if not dist.is_initialized():
        dist.init_process_group(
            backend=backend,
            init_method='tcp://localhost:23456',  # Address for initializing communication
            world_size=world_size,  # Total number of processes
            rank=rank,  # Rank of the current process
        )

    # Guarded separately from the process group: if something else already
    # created the process group, model parallelism must still be set up.
    if not model_parallel_is_initialized():
        initialize_model_parallel(model_parallel_size)

# Initialize distributed environment with the function's defaults
# (world_size=1, rank=0, i.e. a single process)
setup_distributed()


# Now proceed to define your model parameters and create the model
model_args = ModelArgs(
    vocab_size=tokenizer.get_vocab_size(),  # tokenizer is created in the first cell
    dim=512,
    n_layers=6,
    n_heads=8,
    ffn_dim_multiplier=4
)

# NOTE(review): ModelArgs and Transformer come from the project-local model.py;
# any field not set above keeps that module's default — confirm those defaults
# suit the training batches.
model = Transformer(model_args)


> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


In [3]:
from torch.optim import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def compute_lm_loss(outputs, input_ids, pad_id):
    """Return the training loss for one batch.

    Args:
        outputs: Whatever the model's forward call returned. If it is a dict,
            its ``'loss'`` entry is used unchanged. Otherwise it is treated as
            a logits tensor — assumed to be (batch, seq, vocab); TODO confirm
            against model.py.
        input_ids: Token ids that were fed to the model, (batch, seq).
        pad_id: Token id used for padding; positions whose target equals it
            are excluded from the loss.

    Returns:
        Scalar loss tensor.
    """
    if isinstance(outputs, dict):
        return outputs['loss']

    # Next-token prediction: position t is scored against token t + 1, so the
    # last logit and the first token have no counterpart and are dropped.
    logits = outputs[:, :-1, :]
    targets = input_ids[:, 1:]
    return F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        targets.reshape(-1),
        ignore_index=pad_id,
    )


# Training loop
# NOTE(review): if Transformer.forward in model.py disables autograd (e.g. via
# an inference-mode decorator), loss.backward() below will fail — confirm.
model.train()
for epoch in range(1):  # For simplicity, assuming 1 epoch
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)

        # The model is called with token ids only (no labels), so the loss is
        # derived here unless the model already supplies one.
        outputs = model(input_ids, start_pos=0)
        loss = compute_lm_loss(outputs, input_ids, tokenizer.pad_id)

        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch}, Batch {i}, Loss: {loss.item()}")


torch.save(model.state_dict(), 'llm_model.pth')


  input_ids_list = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]


NameError: name 'F' is not defined