In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.distributed as dist
from fairscale.nn.model_parallel import initialize_model_parallel
from model import Transformer, ModelArgs
from tokenizer import Tokenizer  # Assuming your tokenizer script is named tokenizer.py
from torch.optim import AdamW

In [2]:
# --- Data / batching configuration ---
# Number of worker processes passed to `load_dataset` and to the tokenisation `map` call.
NUM_PROC = 24
# Number of sequences per batch produced by the DataLoader.
BATCH_SIZE = 8  # Reduced batch size
# `collate_batch` truncates or pads every sequence to exactly this many tokens.
MAX_SEQ_LEN = 1024  # Reduced sequence length

# Load the dataset: the first 5% of the 'train' split of English Wikipedia (date="20240401").
# NOTE(review): trust_remote_code=True lets the dataset's loading script run locally — confirm the source is trusted.
dataset = load_dataset("wikipedia", language="en", date="20240401", split='train[:5%]', trust_remote_code=True, num_proc=NUM_PROC)
# Path handed to the project's Tokenizer wrapper; relative, so it is resolved against the working directory.
tokenizer_path = 'cl100k_base.tiktoken'
tokenizer = Tokenizer(tokenizer_path)

def tokenize_function(examples):
    """Encode a batch of raw documents into token-id lists.

    Intended for `Dataset.map(..., batched=True)`: `examples['text']` is an
    iterable of strings. Each string is encoded with the module-level
    `tokenizer`, requesting both a beginning-of-sequence and an
    end-of-sequence marker.

    Returns:
        dict with a single key 'input_ids' holding one encoded sequence per
        input text, in the same order.
    """
    encoded_documents = []
    for document in examples['text']:
        encoded_documents.append(tokenizer.encode(document, bos=True, eos=True))
    return {'input_ids': encoded_documents}

# Tokenise the dataset in batches across NUM_PROC worker processes; `tokenize_function` contributes the 'input_ids' column.
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=NUM_PROC)
# Restrict the formatted output to the 'input_ids' column, returned in 'torch' format.
tokenized_datasets.set_format('torch', columns=['input_ids'])

def collate_batch(batch):
    """Assemble dataset rows into one fixed-width batch of token ids.

    Each row's 'input_ids' is copied, cast to ``torch.long`` and then either
    cut down to its first ``MAX_SEQ_LEN`` entries or right-padded with
    ``tokenizer.pad_id`` up to ``MAX_SEQ_LEN``. The rows are finally combined
    batch-first.

    Args:
        batch: sequence of mappings, each with an 'input_ids' tensor
            (assumed 1-D — TODO confirm against the dataset format).

    Returns:
        dict with key 'input_ids' mapping to the combined batch tensor.
    """
    pad_value = tokenizer.pad_id
    fixed_length_rows = []
    for example in batch:
        token_ids = example['input_ids'].clone().detach().to(torch.long)
        n_tokens = len(token_ids)
        if n_tokens > MAX_SEQ_LEN:
            # Too long: keep only the leading MAX_SEQ_LEN tokens.
            token_ids = token_ids[:MAX_SEQ_LEN]
        else:
            # Short (or exact) rows are right-padded up to MAX_SEQ_LEN.
            token_ids = F.pad(token_ids, (0, MAX_SEQ_LEN - n_tokens), value=pad_value)
        fixed_length_rows.append(token_ids)

    # All rows already share one length, so this only combines them batch-first.
    combined = pad_sequence(fixed_length_rows, batch_first=True, padding_value=pad_value)
    return {'input_ids': combined}

# Shuffled training batches of BATCH_SIZE rows, each assembled by `collate_batch`.
train_dataloader = DataLoader(tokenized_datasets, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

In [3]:
# Set up distributed environment
def setup_distributed(world_size=1, rank=0, backend=None, model_parallel_size=1,
                      init_method='tcp://localhost:23456'):
    """Initialise the torch.distributed process group and model parallelism.

    The function does nothing when a process group already exists, so the
    notebook cell can be re-run safely without initialising twice.

    Args:
        world_size: Total number of participating processes.
        rank: Rank of the current process.
        backend: Communication backend name. When ``None`` (default), 'nccl'
            is used if CUDA and NCCL are both available, otherwise 'gloo', so
            the call also works on CPU-only hosts or builds without NCCL.
        model_parallel_size: Size passed to ``initialize_model_parallel``.
        init_method: URL used by the processes to rendezvous.
    """
    if dist.is_initialized():
        return

    if backend is None:
        # NCCL requires CUDA devices and a build with NCCL support; fall back
        # to gloo everywhere else instead of failing in init_process_group.
        nccl_usable = torch.cuda.is_available() and dist.is_nccl_available()
        backend = 'nccl' if nccl_usable else 'gloo'

    dist.init_process_group(
        backend=backend,
        init_method=init_method,  # Address for initializing communication
        world_size=world_size,  # Total number of processes
        rank=rank  # Rank of the current process
    )
    # Initialize model parallelism only together with the process group, so a
    # repeated call never initialises it a second time.
    initialize_model_parallel(model_parallel_size)

# Initialise the process group and model parallelism with the defaults (world_size=1, rank=0)
# before the model is constructed.
# presumably Transformer's layers require model parallelism to be initialised first — TODO confirm against model.py
setup_distributed()

# Model hyper-parameters; the vocabulary size is taken from the tokenizer.
# NOTE(review): no sequence-length argument is passed here — confirm ModelArgs' default covers MAX_SEQ_LEN.
model_args = ModelArgs(
    vocab_size=tokenizer.get_vocab_size(),
    dim=512,
    n_layers=6,
    n_heads=8,
    ffn_dim_multiplier=4
)
model = Transformer(model_args)

def print_model_details(model):
    """Print a per-parameter summary of `model` followed by the grand total.

    For every (name, parameter) pair yielded by ``model.named_parameters()``
    one line is printed with the name, the parameter's size and its element
    count; the last line reports the sum of all element counts.

    Args:
        model: Object exposing ``named_parameters()``.

    Returns:
        None. Output goes to stdout.
    """
    print("Model Layers and Parameters:")
    running_total = 0
    for layer_name, tensor in model.named_parameters():
        n_elements = tensor.numel()
        print(f"{layer_name}, Shape: {tensor.size()}, Parameters: {n_elements}")
        running_total += n_elements
    print(f"Total Parameters: {running_total}")


print_model_details(model)  # Prints each parameter's name, shape and element count, then the total

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Model Layers and Parameters:
tok_embeddings.weight, Shape: torch.Size([100512, 512]), Parameters: 51462144
layers.0.attention.wq.weight, Shape: torch.Size([512, 512]), Parameters: 262144
layers.0.attention.wk.weight, Shape: torch.Size([512, 512]), Parameters: 262144
layers.0.attention.wv.weight, Shape: torch.Size([512, 512]), Parameters: 262144
layers.0.attention.wo.weight, Shape: torch.Size([512, 512]), Parameters: 262144
layers.0.feed_forward.w1.weight, Shape: torch.Size([5632, 512]), Parameters: 2883584
layers.0.feed_forward.w2.weight, Shape: torch.Size([512, 5632]), Parameters: 2883584
layers.0.feed_forward.w3.weight, Shape: torch.Size([5632, 512]), Parameters: 2883584
layers.0.attention_norm.weight, Shape: torch.Size([512]), Parameters: 512
layers.0.ffn_norm.weight, Shape: torch.Size([512]), Parameters: 512
layers.1.attention.wq.weight, Shape: torch.Size([512, 512]), Parame

In [4]:
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import cross_entropy

# Assuming you've already initialized your model, dataloader, and other components

# Optimizer setup
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Number of micro-batches whose gradients are summed before each optimizer
# step (effective batch size = DataLoader batch size * accumulation_steps).
accumulation_steps = 4

# Training loop: next-token prediction with gradient accumulation.
model.train()
for epoch in range(1):  # Assuming you want to train for a certain number of epochs
    optimizer.zero_grad()
    batches_seen = 0
    for i, batch in enumerate(train_dataloader):
        batches_seen = i + 1
        input_ids = batch['input_ids'].to(device)

        # NOTE: gradients are deliberately NOT zeroed here. Zeroing on every
        # batch would discard the gradients accumulated since the last step.
        outputs = model(input_ids, start_pos=0)  # Outputs are logits

        # Shift so that the logits at position t are scored against token t+1.
        shift_logits = outputs[..., :-1, :].contiguous()
        shift_labels = input_ids[..., 1:].contiguous()

        # Flatten the logits and labels to fit into cross_entropy. Positions
        # whose target is the padding id are excluded from the loss.
        # NOTE(review): this assumes tokenizer.pad_id is not also a real
        # token id (e.g. EOS) — confirm against tokenizer.py.
        loss = cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=tokenizer.pad_id,
        )

        # Scale so the summed gradients equal the mean over the accumulation
        # window rather than `accumulation_steps` times that mean.
        (loss / accumulation_steps).backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        # Report the unscaled per-batch loss.
        print(f"Epoch {epoch}, Batch {i}, Loss: {loss.item()}")

    # Apply gradients left over from a final, incomplete accumulation window
    # so that the last batches of the epoch are not silently dropped.
    if batches_seen % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

torch.save(model.state_dict(), 'llm_model.pth')

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.13 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 21.48 GiB is allocated by PyTorch, and 17.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)