In [None]:
from src.data.data import BabyJoeyDataset, BabyJoeyDataLoader
from src.model.model import BabyJoeyModel
from dataclasses import dataclass
import torch
from torch.optim import AdamW
import torch.nn.functional as F

@dataclass
class ModelConfig:
    """Hyperparameter bundle that is handed to ``BabyJoeyModel``.

    Every field is required except ``dropout_rate``, which falls back to 0.1.
    """

    # Size of the token vocabulary.
    vocab_size: int
    # presumably the embedding / hidden width -- confirm against BabyJoeyModel
    n_embd: int
    # presumably the number of attention heads -- confirm against BabyJoeyModel
    n_head: int
    # presumably the number of stacked layers -- confirm against BabyJoeyModel
    n_layers: int
    # presumably the longest sequence the model accepts -- confirm against BabyJoeyModel
    max_seq_len: int
    # Token id used for padding.
    padding_idx: int
    # Dropout rate; 0.1 unless overridden.
    dropout_rate: float = 0.1


# Example hyperparameters used by the rest of this cell.
config = ModelConfig(
    vocab_size=50257,
    n_embd=768,
    n_head=12,
    n_layers=12,
    max_seq_len=1024,
    padding_idx=50256,
    dropout_rate=0.1,
)

# Obtain the training / validation datasets from the project's dataset wrapper.
dataset_instance = BabyJoeyDataset()
training_dataset, validation_dataset = dataset_instance.load_or_create_datasets()

# Seed PyTorch's global RNG before the loaders and the model are created, so that
# anything drawn from it (parameter initialisation, shuffling that uses the global
# generator) is repeatable across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# One loader per split.
data_loader_instance = BabyJoeyDataLoader(training_dataset, validation_dataset)
train_loader, val_loader = data_loader_instance.get_dataloaders()

# Initialize model on the GPU when one is available, otherwise on the CPU,
# and put it in training mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BabyJoeyModel(config).to(device)
model.train()

# Use AdamW optimizer
optimizer = AdamW(model.parameters(), lr=3e-4)  # Example learning rate

def _next_token_loss(token_logits, token_ids):
    """Score each position's logits against the token that follows it.

    The logits of the final position and the first token are dropped so the
    two tensors line up. Targets equal to ``config.padding_idx`` are ignored
    and label smoothing of 0.1 is applied.
    """
    predictions = token_logits[:, :-1, :].contiguous()  # [batch_size, seq_len-1, vocab_size]
    targets = token_ids[:, 1:].contiguous()              # [batch_size, seq_len-1]
    return F.cross_entropy(
        predictions.view(-1, config.vocab_size),
        targets.view(-1),
        ignore_index=config.padding_idx,
        label_smoothing=0.1,
    )


# Train for a fixed number of epochs; validate and checkpoint after each one.
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # ---- training pass ----
    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)  # [batch_size, seq_len]
        logits = model(input_ids)                  # [batch_size, seq_len, vocab_size]
        loss = _next_token_loss(logits, input_ids)

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 10th step.
        if (step + 1) % 10 == 0:
            print(f"Step {step+1}, Loss: {loss.item():.4f}")

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            logits = model(input_ids)
            val_loss += _next_token_loss(logits, input_ids).item()
            val_steps += 1

    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero when
    # the validation loader yields nothing.
    avg_val_loss = val_loss / max(val_steps, 1)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    model.train()  # back to training mode for the next epoch

    # ---- per-epoch checkpoint ----
    # NOTE: 'config' holds the ModelConfig instance itself, so the class must be
    # defined/importable wherever this file is later unpickled.
    checkpoint_path = f"baby_joey_checkpoint_epoch_{epoch+1}.pt"
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': avg_val_loss,
        'config': config,
    }
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


  from .autonotebook import tqdm as notebook_tqdm
  training_dataset = torch.load(self.train_file)
  validation_dataset = torch.load(self.valid_file)


Input IDs shape: torch.Size([2, 512])
Epoch 1/3
Step 10, Loss: 10.8557
Step 20, Loss: 10.4595
Step 30, Loss: 9.8785
Step 40, Loss: 9.0284
Step 50, Loss: 8.7108
Step 60, Loss: 7.8799
Step 70, Loss: 7.8541
Step 80, Loss: 8.2249
Step 90, Loss: 8.7099
Step 100, Loss: 9.1176
Step 110, Loss: 8.3448
Step 120, Loss: 8.0079
Step 130, Loss: 9.0371
Step 140, Loss: 8.2543
Step 150, Loss: 8.4205
Step 160, Loss: 8.5938
Step 170, Loss: 7.4113
Step 180, Loss: 7.4567
Step 190, Loss: 9.2399
Step 200, Loss: 7.2231
Step 210, Loss: 7.1624
Step 220, Loss: 9.2036
Step 230, Loss: 7.9176
Step 240, Loss: 7.5024
Step 250, Loss: 8.0537
Step 260, Loss: 5.3396
Step 270, Loss: 7.8347
Step 280, Loss: 7.8839
Step 290, Loss: 7.3701
Step 300, Loss: 8.0765
Step 310, Loss: 7.1636
Step 320, Loss: 7.9542
Step 330, Loss: 7.9724
Step 340, Loss: 8.3047
Step 350, Loss: 7.5592
Step 360, Loss: 8.4055
Step 370, Loss: 7.3380
Step 380, Loss: 7.5311
Step 390, Loss: 8.4183
Step 400, Loss: 8.5113
Step 410, Loss: 7.5828
Step 420, Loss: 

In [None]:
from src.data.data import BabyJoeyDataset, BabyJoeyDataLoader
from src.model.model import BabyJoeyModel
from dataclasses import dataclass
import torch
from torch.optim import Adam
import torch.nn.functional as F


In [None]:



@dataclass
class ModelConfig:
    """Settings object passed to ``BabyJoeyModel`` when the model is built.

    Only ``dropout_rate`` has a default (0.1); all other fields must be given.
    """

    # Size of the token vocabulary.
    vocab_size: int
    # presumably the embedding / hidden width -- confirm against BabyJoeyModel
    n_embd: int
    # presumably the number of attention heads -- confirm against BabyJoeyModel
    n_head: int
    # presumably the number of stacked layers -- confirm against BabyJoeyModel
    n_layers: int
    # presumably the longest sequence the model accepts -- confirm against BabyJoeyModel
    max_seq_len: int
    # Token id used for padding.
    padding_idx: int
    # Dropout rate; 0.1 unless overridden.
    dropout_rate: float = 0.1


# Small single-layer configuration used by the cells below.
config = ModelConfig(
    vocab_size=50257,
    n_embd=512,
    n_head=8,
    n_layers=1,
    max_seq_len=512,
    padding_idx=50256,
    dropout_rate=0.1,
)
 

In [None]:
# Create the dataset wrapper and obtain the training / validation datasets from it.
# The wrapper is kept around because a later cell uses its tokenizer for decoding.
dataset_instance = BabyJoeyDataset()
training_dataset, validation_dataset = dataset_instance.load_or_create_datasets()

In [None]:
# Wrap both datasets in the project's loader helper and get one loader per split.
data_loader_instance = BabyJoeyDataLoader(training_dataset, validation_dataset)
train_loader, val_loader = data_loader_instance.get_dataloaders()


In [None]:
# Sanity check: pull the first batch from a fresh iterator over the training loader.
samples = next(iter(train_loader))

In [None]:
# Select the first entry of the batch's 'input_ids' field.
sample = samples['input_ids'][0]

In [None]:
# Bare expression: the notebook shows the selected token ids as this cell's output.
sample

In [None]:
# Turn the token ids back into text with the dataset's tokenizer (special tokens
# are skipped) and print it, to check by eye that the batch holds sensible data.
decoded_text = dataset_instance.tokenizer.decode(sample, skip_special_tokens=True)
print(decoded_text)

In [None]:

# Seed PyTorch's global RNG first so that parameter initialisation drawn from it
# is repeatable across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Initialize model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BabyJoeyModel(config).to(device)


In [None]:

# Set up optimizer: Adam over all model parameters.
optimizer = Adam(model.parameters(), lr=3e-4)  # 3e-4 is an example learning rate, not a tuned value


In [None]:


# Simple training loop: next-token prediction with a validation pass after every epoch.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # Enter training mode at the top of every epoch. The validation pass below
    # switches the model to eval mode, so if this cell is interrupted during
    # validation and then run again, training would otherwise proceed in eval mode.
    model.train()
    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)  # [batch_size, seq_len]

        # Forward pass
        logits = model(input_ids)  # [batch_size, seq_len, vocab_size]

        # Shift inputs and logits for causal language modeling:
        # the logits at position t are scored against the token at position t+1,
        # so drop the last position's logits and the first token of the targets.
        shifted_logits = logits[:, :-1, :].contiguous()   # [batch_size, seq_len-1, vocab_size]
        shifted_input_ids = input_ids[:, 1:].contiguous()  # [batch_size, seq_len-1]

        # Flatten for cross-entropy. Targets equal to the padding id are excluded
        # so the loss is averaged over real tokens only.
        # NOTE(review): if the tokenizer reuses the padding id as an end-of-text
        # token, those targets are skipped as well -- confirm this is intended.
        loss = F.cross_entropy(
            shifted_logits.view(-1, config.vocab_size),
            shifted_input_ids.view(-1),
            ignore_index=config.padding_idx,
        )

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 10th step.
        if (step + 1) % 10 == 0:
            print(f"Step {step+1}, Loss: {loss.item():.4f}")

    # Validation loop: eval mode, no gradient tracking.
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            logits = model(input_ids)

            shifted_logits = logits[:, :-1, :].contiguous()
            shifted_input_ids = input_ids[:, 1:].contiguous()

            # Same objective as in training (padding targets ignored). Stored under
            # its own name so the last training `loss` is not overwritten.
            val_batch_loss = F.cross_entropy(
                shifted_logits.view(-1, config.vocab_size),
                shifted_input_ids.view(-1),
                ignore_index=config.padding_idx,
            )

            val_loss += val_batch_loss.item()
            val_steps += 1

    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero when the
    # validation loader yields nothing.
    avg_val_loss = val_loss / max(val_steps, 1)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    model.train()  # leave the model in training mode once validation is done