In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from logging import basicConfig, INFO
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset
from tqdm.autonotebook import tqdm

import building_babel.model as bbm
from building_babel.tokenizers import LlamaTokenizer

In [3]:
# Send INFO-and-above log records to the notebook output; the tokenizer and
# model messages that appear in later cell outputs are emitted at this level.
basicConfig(level=INFO)

In [4]:
# One TinyStories training shard (file 00000 of 00004) read from the local
# Hugging Face cache.  The path is built from the current user's home
# directory instead of a hard-coded /Users/<name> prefix.
TINYSTORIES_SHARD = (
    Path.home()
    / ".cache/huggingface/hub/datasets--roneneldan--TinyStories"
    / "snapshots/df479c2caefda65cbb295f6f9d3e61b0be8d6593"
    / "data/train-00000-of-00004-2d5a1467fff1081b.parquet"
)
df = pd.read_parquet(TINYSTORIES_SHARD)

In [5]:
# SentencePiece model for the Llama-2 tokenizer, located relative to the
# current user's home directory rather than a hard-coded /Users/<name> prefix.
TOKENIZER_MODEL = Path.home() / "Models/llama-2-tokenizer/tokenizer.model"
# Passed as a plain string so the tokenizer receives the same argument type as before.
lt = LlamaTokenizer(str(TOKENIZER_MODEL))

INFO:__name__:reloaded SentencePiece model from /Users/spott/Models/llama-2-tokenizer/tokenizer.model
INFO:__name__:#words: 32000 - BOS ID: 1 - EOS ID: 2


In [6]:
# Zero-initialised counter with one slot per tokenizer vocabulary entry; only
# the commented-out token-frequency cells below read it.
bins = torch.zeros(lt.n_words)

In [7]:
# for d in tqdm(df['text']):
#     bins += np.bincount(lt.encode(d,False, False), minlength=lt.n_words)

In [8]:
# sibins = torch.argsort(bins, descending=True)
# sbins = bins[sibins]

In [9]:
# tokens = [lt.sp_model.IdToPiece(i) for i in sibins.tolist()]

In [10]:
# len(bins.nonzero())

In [11]:
# tokens[:150]

In [12]:
# Show the story stored under index label 10 as a sanity check of the raw text.
df.loc[10, 'text']

'Once upon a time, there was a big car named Dependable. He had a very important job. Dependable would take a family to the park every day. The family had a mom, dad, and a little girl named Lily. They all had a lot of love for each other.\n\nOne day, when they got to the park, they saw a big sign that said, "Fun Race Today!" The family was very excited. They knew that Dependable was very fast and could win the race. So, they decided to join the race.\n\nThe race started, and Dependable went very fast. The other cars tried to catch up, but Dependable was too quick. In the end, Dependable won the race! The family was so happy and proud of their car. They knew that their love for each other and their trust in Dependable made them win the race. And from that day on, they had even more fun at the park, knowing that they had the fastest and most dependable car around.'

In [13]:
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one fixed-length tensor of token ids per text.

    Every item is the encoded text (both boolean ``encode`` flags set to True,
    presumably requesting BOS and EOS -- confirm against the tokenizer) that is
    right-padded to ``max_len`` so all items share one length.

    Args:
        df: DataFrame with a ``'text'`` column of strings.
        tokenizer: Object exposing ``encode(text, bool, bool) -> list[int]``.
            When omitted, the notebook-level ``lt`` tokenizer is looked up at
            item-access time.
        pad_id: Value written into the positions after the encoded text.
    """

    def __init__(self, df, tokenizer=None, pad_id=0):
        self.df = df['text']
        self.tokenizer = tokenizer
        self.pad_id = pad_id
        # Length of the longest text measured in *characters*, not tokens; it is
        # used as the common item length.  Stored as a plain int, 0 when empty.
        char_lengths = self.df.apply(len)
        self.max_len = int(char_lengths.max()) if len(char_lengths) else 0

    def __getitem__(self, i):
        tok = lt if self.tokenizer is None else self.tokenizer
        # Positional lookup: samplers hand out 0..len-1, which matches the index
        # labels only while the frame still carries its default RangeIndex.
        text = self.df.iloc[i]
        tokenized = torch.tensor(tok.encode(text, True, True), dtype=torch.int64)
        # Crop encodings longer than max_len so the pad width is never negative.
        tokenized = tokenized[: self.max_len]
        pad = (0, self.max_len - len(tokenized))
        # NOTE(review): the training loop's cross_entropy call passes no
        # ignore_index, so these padded positions are scored too -- confirm
        # that is intended.
        return F.pad(tokenized, pad, "constant", self.pad_id)

    def __len__(self):
        return len(self.df)

In [23]:
def sample(x):
    """Draw a single index per row of ``x``, using the row values as weights.

    Returns whatever ``torch.multinomial`` yields for one draw, i.e. the input
    shape with its last dimension replaced by size 1.
    """
    drawn = torch.multinomial(x, num_samples=1)
    return drawn

In [24]:
def generate(t, deterministic=False, max_new_tokens=18, tokenizer=None):
    """Generate a short continuation starting from BOS and print the decoded text.

    Args:
        t: Callable mapping a ``(1, seq)`` tensor of token ids to scores whose
            last dimension indexes the vocabulary; only position ``[:, -1]``
            is used at each step.
        deterministic: If True, take the highest-scoring token at every step;
            otherwise draw from the softmax distribution via ``sample``.
        max_new_tokens: Upper bound on the number of generated tokens.
        tokenizer: Object providing ``bos_id`` and ``decode``; defaults to the
            notebook-level ``lt``.

    Generation stops early as soon as the model emits id 0 (the value the
    dataset pads with), BOS, or EOS.  Nothing is returned; the decoded
    sequence, without the leading BOS, is printed.
    """
    tok = lt if tokenizer is None else tokenizer
    # EOS is id 2 for the tokenizer loaded in this notebook (see its startup
    # log line); use that when the tokenizer object exposes no `eos_id`.
    stop_ids = {0, tok.bos_id, getattr(tok, "eos_id", 2)}
    seq = torch.tensor([[tok.bos_id]])
    for _ in range(max_new_tokens):
        scores = t(seq)[:, -1]
        if deterministic:
            # argmax over raw scores equals argmax over their softmax.
            next_token = scores.argmax(dim=-1).view(-1, 1)
        else:
            next_token = sample(scores.softmax(dim=-1)).view(-1, 1)
        # Previously `< 2`, which let EOS (2) be appended and generation continue.
        if int(next_token[0, -1]) in stop_ids:
            break
        seq = torch.concat([seq, next_token], dim=-1)
    print(tok.decode(seq[:, 1:].tolist()))

In [16]:
# Wrap the loaded stories so each item becomes a padded tensor of token ids.
sds = SimpleDataset(df)

In [17]:
# Common item length: the character count of the longest story in the frame.
sds.max_len

5499

In [18]:
# Model configuration.  The first two positional arguments are presumably the
# model width and the number of layers -- confirm against
# building_babel.model.TransformerConfig.
c = bbm.TransformerConfig(
    128,
    1,
    lt.n_words,
    head_dim=128,
    # Taken from the dataset instead of repeating the literal 5499, so the
    # model's sequence limit cannot drift from the padded item length.
    max_seq_len=int(sds.max_len),
)
t = bbm.Transformer(c)

In [19]:
# Shuffled mini-batches of ten padded sequences each.
dl = torch.utils.data.DataLoader(
    dataset=sds,
    batch_size=10,
    shuffle=True,
)

In [20]:
# Adam over every model parameter with a fixed learning rate.
optim = torch.optim.Adam(
    params=t.parameters(),
    lr=3e-5,
)

In [21]:
# Run 20 passes over the DataLoader, printing the pass number first and a
# greedy sample from the model after each pass.
for i in range(20):
    print(i)
    for b in tqdm(dl):
        optim.zero_grad()
        # Feed every position except the last; the targets below are the same
        # batch shifted left by one position.
        out = t(b[...,:-1])

        # The transpose moves the last dimension of `out` into position 1,
        # where cross_entropy expects the class scores
        # (assumes `out` is (batch, seq, vocab) -- TODO confirm).
        # NOTE(review): no ignore_index is given, so the zero-padded tail of
        # every sequence is scored as well -- confirm that is intended.
        loss = F.cross_entropy(out.transpose(1,2), b[...,1:])
        
        loss.backward()
        optim.step()
    # Gradients are not needed while sampling.
    with torch.no_grad():
        generate(t, deterministic=True)

0


  0%|          | 0/52993 [00:00<?, ?it/s]

INFO:__name__:transformer block
INFO:__name__:attention forward
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([10, 5498, 128]), x.shape=torch.Size([10, 5498, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([10, 5498, 128]), x.shape=torch.Size([10, 5498, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([10, 5498, 128]), x.shape=torch.Size([10, 5498, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:weights
INFO:__name__:after rotary emb
INFO:__name__:after attention
INFO:__name__:in growable_linear
INFO:__name__:building ou

KeyboardInterrupt: 

In [25]:
# Greedy sample from the model as left by the interrupted training run above.
with torch.no_grad():
    generate(t, deterministic=True)

INFO:__name__:transformer block
INFO:__name__:attention forward
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([1, 1, 128]), x.shape=torch.Size([1, 1, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([1, 1, 128]), x.shape=torch.Size([1, 1, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 output: torch.Size([1, 1, 128]), x.shape=torch.Size([1, 1, 128]), self.weight_splits[str(i)].shape=torch.Size([128, 128]), in_dim=128, prev_in_dim=0, out_dim=128, prev_out_dim=0
INFO:__name__:weights
INFO:__name__:after rotary emb
INFO:__name__:after attention
INFO:__name__:in growable_linear
INFO:__name__:building output
INFO:__name__:0 out

['населения остров scoreutch coup mantenalert externas intermediate беorage Ribultan Scouseelijk rör alias']
