In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Wikipedia corpus, training split only.
# NOTE(review): confirm that "20240401" is a valid config name for this dataset.
dataset = load_dataset(path="wikipedia", name="20240401", split="train")

# Location of the serialized tokenizer; point this at your own tokenizer.json
tokenizer_path = "tokenizer.json"

# Project-local tokenizer class (expected to live in tokenizer.py)
from tokenizer import Tokenizer
tokenizer = Tokenizer(tokenizer_path)

# Function to tokenize the text
def tokenize_function(examples):
    """Tokenize the 'text' field of a single example or of a batch of examples.

    This function is passed to ``dataset.map(..., batched=True)``, in which
    case ``examples['text']`` is a list of strings rather than one string, so
    every string is encoded on its own.

    Args:
        examples: Mapping with a 'text' entry that is either a ``str`` (one
            example) or an iterable of ``str`` (a batch).

    Returns:
        ``{'input_ids': ids}`` where ``ids`` is the encoding of the single
        string, or a list with one encoding per string for a batch. Each
        encoding is requested with ``bos=True, eos=True``.
    """
    text = examples['text']
    if isinstance(text, str):
        return {'input_ids': tokenizer.encode(text, bos=True, eos=True)}
    # Batched call: encode each document separately instead of handing the
    # whole list to a single encode() call.
    return {
        'input_ids': [tokenizer.encode(doc, bos=True, eos=True) for doc in text]
    }

# Apply the tokenization to the whole corpus (batched map)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Expose only input_ids, formatted as torch tensors
tokenized_datasets.set_format('torch', columns=['input_ids'])

# Documents tokenize to different lengths. The DataLoader's default collation
# stacks the per-example tensors and fails on ragged batches, so pad every
# batch to the length of its longest sequence instead.
# NOTE(review): 0 is a placeholder pad id -- confirm against the tokenizer, and
# make sure the loss ignores padded positions. Sequences are not truncated
# here; very long articles may exceed the model's context length.
PAD_TOKEN_ID = 0


def pad_collate(examples):
    """Right-pad a list of ``{'input_ids': tensor}`` examples into one batch.

    Assumes each ``input_ids`` is a 1-D tensor of token ids.

    Returns:
        ``{'input_ids': tensor}`` of shape (batch, longest_sequence_in_batch).
    """
    sequences = [example['input_ids'] for example in examples]
    padded = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PAD_TOKEN_ID
    )
    return {'input_ids': padded}


# DataLoader to batch data
train_dataloader = DataLoader(
    tokenized_datasets, batch_size=32, shuffle=True, collate_fn=pad_collate
)


In [None]:
from model import Transformer, ModelArgs

# Model hyperparameters, collected in one place (adjust according to your needs)
model_hparams = {
    'vocab_size': tokenizer.get_vocab_size(),  # taken from the loaded tokenizer
    'dim': 512,
    'n_layers': 6,
    'n_heads': 8,
    'ffn_dim_multiplier': 4,
}
model_args = ModelArgs(**model_hparams)

# Build the network from the configuration above
model = Transformer(model_args)


In [None]:
from torch.optim import AdamW

# Move the model to the GPU (if available) *before* building the optimizer, as
# the torch.optim documentation recommends, so the optimizer is constructed
# over the parameters that are actually trained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

NUM_EPOCHS = 1   # number of passes over the training data
LOG_EVERY = 100  # steps between loss printouts; keeps notebook output readable

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        outputs = model(input_ids)
        # NOTE(review): this relies on the model's forward returning an object
        # with a `.loss` attribute even though no labels are passed -- confirm
        # against model.py.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # loss.item() is only evaluated on logging steps.
        if step % LOG_EVERY == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}")

# Save only the model weights (state_dict); optimizer state is not persisted.
torch.save(model.state_dict(), 'llm_model.pth')
