In [1]:
from llama import LLaMa
from datasets import load_dataset
from huggingface_hub import HfFolder, login
from transformers import Trainer, TrainingArguments, AutoTokenizer
from torch import nn
from torch.utils.data import DataLoader, Dataset

import torch

In [2]:
login(token="")
dataset_id = "ashaba1in/small_openwebtext"
dataset = load_dataset(dataset_id)

print("Dataset size", len(dataset['train']['text']))
print(dataset['train']['text'][0])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/legend/.cache/huggingface/token
Login successful
Dataset size 1000000
Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.

The decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.

CNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with

In [3]:
tokenizer_id ="mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    

In [4]:
len(tokenizer(dataset['train']['text'][4])['input_ids'])

705

In [5]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [6]:
cropped_dataset = dataset['train'].select(range(1000))

In [7]:
cropped_dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [12]:
MAX_SEQ_LEN = 1024
PAD_ID = tokenizer.eos_token_id

def tokenize(batch):
    return tokenizer(batch['text'])

class MyDataset(Dataset):
    def __init__(self, dataset):
        super().__init__()
        self.tokenized_data = dataset.map(tokenize, batched=True, remove_columns=['text'])['input_ids']

    def __getitem__(self, index):
        return self.tokenized_data[index]

    def __len__(self):
        return len(self.tokenized_data)

def collate_fn(batch):
    flatten = []
    for text in batch:
        flatten += text
    target_size = len(batch) * MAX_SEQ_LEN
    if len(flatten) < target_size:
        flatten += [PAD_ID] * (target_size - len(flatten))
    else:
        flatten = flatten[:target_size]
    
    flatten = torch.Tensor(flatten).to(torch.long).view(len(batch), MAX_SEQ_LEN)
    return flatten


my_data = MyDataset(cropped_dataset)
train_dataloader = DataLoader(my_data, batch_size=1, collate_fn=collate_fn)

In [13]:
device = 'cpu'

In [14]:
import torch.nn.functional as F
from tqdm.notebook import tqdm
import numpy as np
from collections import defaultdict

def compute_loss(criterion, logits: torch.Tensor, labels: torch.Tensor, pad_id):
    logits = logits.reshape(-1, 32000)
    labels = labels.view(-1)

    loss = criterion(logits, labels, ignore_index=pad_id)
    return loss

def train_epoch(model, criterion, optimizer, train_loader, epoch):
    loss_log = []
    model.train()
    
    for data in tqdm(train_loader, desc=f"Training Epoch {epoch}"):
        data = data.to(device) # batch_size, seq_len

        optimizer.zero_grad()
        print(1)
        out = model(data) # batch_size, seq_len, vocab_size
        loss = compute_loss(criterion, out[:, :-1], data[:, 1:].clone(), PAD_ID)
        loss_log.append(loss.item())
        
        loss.backward()
        optimizer.step()

    return loss_log

def train(model, criterion, optimizer, n_epochs, train_loader):
    print(model)
    for epoch in range(n_epochs):
        
        train_loss = train_epoch(model, criterion, optimizer, train_loader, epoch)
        print(f"Train loss: {np.mean(train_loss)}")

vocab_size = tokenizer.vocab_size

model = LLaMa(vocab_size, 3, MAX_SEQ_LEN, 768, 8)
criterion = F.cross_entropy
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)
n_epochs = 5
train(model, criterion, optimizer, n_epochs, train_dataloader)

LLaMa(
  (embed): Embedding(32000, 768)
  (rmsnorm): RMSNorm()
  (blocks): Sequential(
    (LLaMa Block 0): LLaMaBlock(
      (rms_attn): RMSNorm()
      (attention): MultiHeadAttention(
        (w_q): Linear(in_features=768, out_features=768, bias=False)
        (w_k): Linear(in_features=768, out_features=768, bias=False)
        (w_v): Linear(in_features=768, out_features=768, bias=False)
        (shuffler): Linear(in_features=768, out_features=768, bias=False)
      )
      (rms_ffn): RMSNorm()
      (swiglu): SwiGLUFeedForward(
        (U): Linear(in_features=768, out_features=2048, bias=True)
        (W): Linear(in_features=768, out_features=2048, bias=True)
        (V): Linear(in_features=2048, out_features=768, bias=True)
        (swish): Swish(
          (sigmoid): Sigmoid()
        )
      )
    )
    (LLaMa Block 1): LLaMaBlock(
      (rms_attn): RMSNorm()
      (attention): MultiHeadAttention(
        (w_q): Linear(in_features=768, out_features=768, bias=False)
        (w_k)

Training Epoch 0:   0%|          | 0/1000 [00:00<?, ?it/s]

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


KeyboardInterrupt: 

: 

In [81]:
tokenizer.vocab_size

32000

In [85]:
device = 'cpu'

In [91]:
model.__str__()

'LLaMa(\n  (embed): Embedding(32000, 758)\n  (rmsnorm): RMSNorm()\n  (blocks): Sequential(\n    (LLaMa Block 0): LLaMaBlock(\n      (rms_attn): RMSNorm()\n      (attention): MultiHeadAttention(\n        (w_q): Linear(in_features=758, out_features=758, bias=False)\n        (w_k): Linear(in_features=758, out_features=758, bias=False)\n        (w_v): Linear(in_features=758, out_features=758, bias=False)\n        (shuffler): Linear(in_features=758, out_features=758, bias=False)\n      )\n      (rms_ffn): RMSNorm()\n      (swiglu): SwiGLUFeedForward(\n        (U): Linear(in_features=758, out_features=2021, bias=True)\n        (W): Linear(in_features=758, out_features=2021, bias=True)\n        (V): Linear(in_features=2021, out_features=758, bias=True)\n        (swish): Swish(\n          (sigmoid): Sigmoid()\n        )\n      )\n    )\n    (LLaMa Block 1): LLaMaBlock(\n      (rms_attn): RMSNorm()\n      (attention): MultiHeadAttention(\n        (w_q): Linear(in_features=758, out_features=758,

In [1]:
import torch

mask = torch.full((1, 1, 5, 5), float("-inf"))
mask = torch.triu(mask, diagonal=0 + 1)

In [2]:
mask

tensor([[[[0., -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf],
          [0., 0., 0., 0., -inf],
          [0., 0., 0., 0., 0.]]]])