In [21]:
from datasets import load_dataset

ds = load_dataset("dominguesm/alpaca-data-pt-br")

In [22]:
from transformers import AutoTokenizer

# Example usage of the GPT model
tokenizer = AutoTokenizer.from_pretrained("pierreguillou/gpt2-small-portuguese")

In [23]:
df = ds["train"].to_pandas()

In [24]:
df["concat"] = (
    df["instruction"] + "\n\n" + df["input"].fillna("") + "\n\n" + df["output"]
)

In [25]:
# create a column with count of tokens for each row
df["tokens"] = df["concat"].apply(lambda x: len(tokenizer.encode(x)))

In [26]:
df["tokens"].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count    51759.000000
mean        74.283371
std         40.517703
min         10.000000
25%         38.000000
50%         67.000000
75%        110.000000
90%        127.000000
95%        136.000000
99%        159.000000
max        838.000000
Name: tokens, dtype: float64

## Train

In [43]:
import torch

from model import GPT, GPTConfig

device = "cuda" if torch.cuda.is_available() else "cpu"

n_experts = 4

config = GPTConfig(
    block_size=128,
    vocab_size=tokenizer.vocab_size,
    n_layer=4,
    n_head=4,
    n_embd=256,
    n_experts=n_experts,
    capacity_factor=n_experts,
    k=2,
    experts_weight=1e-5,
    router_weight=1e-5,
    dropout=0.2,
    bias=True,
)

model = GPT(config)

number of parameters: 22.34M


In [44]:
inputs = tokenizer(
    df["concat"].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)

input_ids = inputs["input_ids"]
targets = input_ids.clone()

targets[:, :-1] = input_ids[:, 1:]
targets[:, -1] = -1

# convert targets 0 to -1
targets[targets == 0] = -1

# print("input ids:", inputs["input_ids"], "shape:", inputs["input_ids"].shape)

In [45]:
# invert 0 -> 1 and 1 -> 0 in attention mask
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# create a new upper triangular mask
upper_mask = torch.triu(torch.ones(input_ids.shape[1], input_ids.shape[1]), diagonal=1)

# apply the mask to the attention mask
attention_mask = attention_mask.unsqueeze(1) * upper_mask.unsqueeze(0)

# conver to bool
attention_mask = attention_mask.bool()

In [46]:
model(input_ids[:5, :], attention_mask=attention_mask[:5, :, :])[0].shape

Load: tensor([0.4141, 0.3219, 0.6047, 0.6594]), Importance: tensor([0.2371, 0.2186, 0.2720, 0.2723], grad_fn=<MeanBackward1>)
Router loss: 2.2427686417358927e-05, Balance loss: 5.125716597831342e-06
Load: tensor([0.4281, 0.5562, 0.5375, 0.4781]), Importance: tensor([0.2413, 0.2599, 0.2535, 0.2453], grad_fn=<MeanBackward1>)
Router loss: 2.1677575205103494e-05, Balance loss: 5.014181624574121e-06
Load: tensor([0.5344, 0.4609, 0.5172, 0.4875]), Importance: tensor([0.2581, 0.2435, 0.2478, 0.2507], grad_fn=<MeanBackward1>)
Router loss: 2.0801444406970404e-05, Balance loss: 5.0048797675117385e-06
Load: tensor([0.4984, 0.4844, 0.5219, 0.4953]), Importance: tensor([0.2497, 0.2456, 0.2552, 0.2494], grad_fn=<MeanBackward1>)
Router loss: 2.0364239389891736e-05, Balance loss: 5.001861154596554e-06


torch.Size([5, 1, 50257])

### Train loop

In [47]:
# send data to device
model = model.to(device)
input_ids = input_ids[:32, :].to(device)  # limit to 16 for testing
attention_mask = attention_mask[:32, :, :].to(device)
targets = targets[:32, :].to(device)

# get only one sample for each
input_ids = input_ids[0:1, :]
attention_mask = attention_mask[0:1, :, :]
targets = targets[0:1, :]

In [48]:
from tqdm.notebook import tqdm

with torch.no_grad():
    logits, loss = model(input_ids, attention_mask, targets)
    # print("logits:", logits, "shape:", logits.shape)
print("loss:", loss.item() if loss is not None else "N/A")
if loss is not None:
    perplexity = torch.exp(loss)
    print("perplexity:", perplexity.item())

# pred_tokens = logits.argmax(dim=-1)
# tokens = pred_tokens[0].tolist()
# print("predicted tokens:", tokens)
# print("predicted text:", tokenizer.decode(tokens))

optimizer = model.configure_optimizers(
    weight_decay=0.0,
    learning_rate=3e-4,
    betas=(0.9, 0.95),
    device_type="cuda",
)

pbar = tqdm(range(450), desc="Training Epochs")

for epoch in pbar:
    model.train()
    optimizer.zero_grad()
    logits, loss = model(input_ids, attention_mask, targets)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        logits, loss = model(input_ids, attention_mask, targets)

    perplexity = torch.exp(loss)
    pbar.set_postfix(loss=loss.item(), perplexity=perplexity.item())

    pred_tokens = logits.argmax(dim=-1)
    tokens = pred_tokens[0].tolist()

# Router loss: 0.0004672443028539419, Balance loss: 0.019999997690320015

Load: tensor([0.3672, 0.2578, 0.6875, 0.6875], device='cuda:0'), Importance: tensor([0.2366, 0.2119, 0.2695, 0.2819], device='cuda:0')
Router loss: 2.315204073966015e-05, Balance loss: 5.206473360885866e-06
Load: tensor([0.4141, 0.4922, 0.6250, 0.4688], device='cuda:0'), Importance: tensor([0.2366, 0.2494, 0.2718, 0.2422], device='cuda:0')
Router loss: 2.27488035307033e-05, Balance loss: 5.041329586674692e-06
Load: tensor([0.5938, 0.4453, 0.4922, 0.4688], device='cuda:0'), Importance: tensor([0.2660, 0.2315, 0.2523, 0.2501], device='cuda:0')
Router loss: 2.1694688257412054e-05, Balance loss: 5.024935035180533e-06
Load: tensor([0.5391, 0.4453, 0.5703, 0.4453], device='cuda:0'), Importance: tensor([0.2565, 0.2382, 0.2643, 0.2411], device='cuda:0')
Router loss: 1.9788792997132987e-05, Balance loss: 5.023880476073828e-06
loss: 10.920313835144043
perplexity: 55288.1484375
using fused AdamW: True


Training Epochs:   0%|          | 0/450 [00:00<?, ?it/s]

Load: tensor([0.3359, 0.2500, 0.6719, 0.7422], device='cuda:0'), Importance: tensor([0.2242, 0.2078, 0.2781, 0.2899], device='cuda:0',
       grad_fn=<MeanBackward1>)
Router loss: 2.291368946316652e-05, Balance loss: 5.292646164889447e-06
Load: tensor([0.3750, 0.5859, 0.6328, 0.4062], device='cuda:0'), Importance: tensor([0.2343, 0.2588, 0.2692, 0.2377], device='cuda:0',
       grad_fn=<MeanBackward1>)
Router loss: 2.1462954464368522e-05, Balance loss: 5.064256583864335e-06
Load: tensor([0.5859, 0.3750, 0.5391, 0.5000], device='cuda:0'), Importance: tensor([0.2614, 0.2258, 0.2582, 0.2546], device='cuda:0',
       grad_fn=<MeanBackward1>)
Router loss: 1.935486579895951e-05, Balance loss: 5.043319561082171e-06
Load: tensor([0.5312, 0.4062, 0.6094, 0.4531], device='cuda:0'), Importance: tensor([0.2539, 0.2302, 0.2690, 0.2468], device='cuda:0',
       grad_fn=<MeanBackward1>)
Router loss: 1.9911925846827216e-05, Balance loss: 5.042031716584461e-06
Load: tensor([0.4062, 0.2812, 0.6172, 0.69

In [49]:
with torch.no_grad():
    generated_ids = model.generate(
        input_ids[:, :1].clone(),
        max_new_tokens=100,
        temperature=0.1,
        top_k=1,
        greedy=True,
    )
generated_text = tokenizer.decode(generated_ids[0].tolist())
_df = df.head(1)
_df["generated_text"] = [
    tokenizer.decode(generated_ids[i].tolist()) for i in range(len(generated_ids))
]
print(f"Generated text after epoch {epoch + 1}: {generated_text}")


Load: tensor([0., 0., 1., 1.], device='cuda:0'), Importance: tensor([0.2433, 0.2368, 0.2698, 0.2502], device='cuda:0')
Router loss: 6.467852472269442e-06, Balance loss: 5.199723091209307e-06
Load: tensor([0., 0., 1., 1.], device='cuda:0'), Importance: tensor([0.2365, 0.2163, 0.2594, 0.2878], device='cuda:0')
Router loss: 9.163914910459425e-06, Balance loss: 5.472066277434351e-06
Load: tensor([1., 0., 0., 1.], device='cuda:0'), Importance: tensor([0.2891, 0.2054, 0.2352, 0.2702], device='cuda:0')
Router loss: 8.695159863236768e-07, Balance loss: 5.593684818450129e-06
Load: tensor([0., 0., 1., 1.], device='cuda:0'), Importance: tensor([0.2189, 0.2460, 0.2634, 0.2716], device='cuda:0')
Router loss: 1.0817028623932856e-06, Balance loss: 5.350668288883753e-06
Load: tensor([0.0000, 0.5000, 0.5000, 1.0000], device='cuda:0'), Importance: tensor([0.2462, 0.2466, 0.2531, 0.2541], device='cuda:0')
Router loss: 5.8445584727451205e-06, Balance loss: 5.039652933191974e-06
Load: tensor([0.5000, 0.500

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df["generated_text"] = [


In [50]:
_df.head(16).style

Unnamed: 0,instruction,input,output,concat,tokens,generated_text
0,Dê três dicas para se manter saudável.,,1. Coma uma dieta equilibrada e certifique-se de incluir muitas frutas e vegetais. 2. Exercite-se regularmente para manter seu corpo ativo e forte. 3. Durma o suficiente e mantenha um horário de sono consistente.,Dê três dicas para se manter saudável. 1. Coma uma dieta equilibrada e certifique-se de incluir muitas frutas e vegetais. 2. Exercite-se regularmente para manter seu corpo ativo e forte. 3. Durma o suficiente e mantenha um horário de sono consistente.,62,Dê três dicas para se manter saudável. 1. Coma uma dieta equilibrada e certifique-se de incluir muitas frutas e vegetais. 2. Exercite-se regularmente para manter seu corpo ativo e forte. 3. Durma o suficiente e mantenha um horário de sono consistente. 2. 3. 2. 2. 3. 2. Durma o suficiente e e e mantenha um horário de incluir muitas frutas e forte. Durma o suficiente e vegetais. 3. 3
