In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, losses

# Parameters:

- MAX_VOCAB : how can the model interpret

- CONTEXT_WIN : maximum number of tokens the model can look at at once

- EMBED_WIN : the size of each token's vector

- HEADS : number of heads for multi-head attention (Each head learns a different type of relationship)

- FEED_FOWARD : expand to this param size (larger) to learn more complex transformation (and later it is squished back by Dense)

- TRANSFORMER_BLOCKS : blocks = deeper reasoning

- BATCH_SIZE : train param

- EPOCHS : train param

In [15]:
MAX_VOCAB = 1000
CONTEXT_WIN = 20
EMBED_WIN = 64
HEADS = 2
FEED_FOWARD = 128 #4x embed_win
TRANSFORMER_BLOCKS = 2
BATCH_SIZE = 16
EPOCHS = 70

train_text = [
"Machine Learning is very hard",
"I use Arch btw",
"I need a job",
"GitHub is goated",
"Saber best waifu :3",
"Skibidi Toilet",
"This is a test"
]

# Tokenizer
- This is a reprocessing layer that simply split text by white space and assign it with an interger IDs

- `output_sequence_length=CONTEXT_WIN` : apply "0" padding so that every seqs are the same length (20 in this case)

# Data prep

slicing the last word for input and last word for output
-> this will then force the model to learn the connection of words and predict the next token :0

In [4]:
tokenizer = layers.TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode="int",
    output_sequence_length=CONTEXT_WIN,
    standardize=None,
    split="whitespace"
)
tokenizer.adapt(train_text) #build vocab
vocab = tokenizer.get_vocabulary() #convert to int IDs

print(vocab)

#next token prediction
def prep_data(train_text):
  seq = tokenizer(train_text)
  x_input = seq[:, :-1]
  y_target = seq[:, 1:]

  return x_input, y_target

x_train, y_train = prep_data(train_text)

['', '[UNK]', np.str_('is'), np.str_('a'), np.str_('I'), np.str_('waifu'), np.str_('very'), np.str_('use'), np.str_('test'), np.str_('need'), np.str_('job'), np.str_('hard'), np.str_('goated'), np.str_('btw'), np.str_('best'), np.str_('Toilet'), np.str_('This'), np.str_('Skibidi'), np.str_('Saber'), np.str_('Machine'), np.str_('Learning'), np.str_('GitHub'), np.str_('Arch'), np.str_(':3')]


# Perplexity metric
*overall it shows how many choises is the model confused between*
- `crossentropy(true,pred)` cross entropy loss with true_labels predicted value
- then `perplexity` = e^`crossentropy`

# Token Position Embedding
Once the text is tokenized, each token needs to be represented in a way that captures not just the token but also its meaning and position (relation to other tokens)

*This is how it works visually:*

```
Input tokens:     ["the", "dog", "runs"]  →  IDs: [4, 27, 13]
Token embeddings:  [row_v4,  ...  ,row_v27, ...  ,row_v13  ]  
Position embeddings:[row_p0,  ...  ,row_p1,  ...  ,row_p2  ]  
Output:           [v4+p0, v27+p1, v13+p2]
```

In [5]:
def perplexity(true,pred):
  return tf.exp(tf.reduce_mean(losses.sparse_categorical_crossentropy(true,pred)))

class TokenPositionEmbedding(layers.Layer):
  def __init__(self, context_win, max_vocab, embed_dim, **kwargs) -> None:
    super().__init__()
    self.token_embed = layers.Embedding(input_dim = max_vocab, output_dim=embed_dim)
    self.position_embed = layers.Embedding(input_dim = context_win, output_dim=embed_dim)

  def call(self, x):
    context_win = tf.shape(x)[-1]
    positions = self.position_embed(tf.range(start=0, limit=context_win, delta=1))
    return self.token_embed(x) + positions


# Transformer blocks
Overall tructure:
```
Input -> Multi-head attention -> Add Residual -> LayerNorm
-> Feed Forward ->Add Residual -> LayerNorm -> Output
```
- `multi-head attention`: each head works on an equal slice of the embedding.

- `feed forward network` is a small 2 layers network:

It first expands the dimension to feed_forward, applies ReLU, then squishes back down to embed_dim

- `layer norm and dropout`: norm stabilizes training and dropout randomly zeros out 10% of values during training to prevent overfitting.

*notes: self-attention is basically seeing how other words are relevant to the current word*

In [6]:
class TransformerBlock(layers.Layer):
  def __init__(self, embed_dim, heads, feed_forward, **kwargs) -> None:
    super().__init__()
    self.attention = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim // heads)
    self.feed_foward_net = models.Sequential([layers.Dense(feed_forward, activation="relu"), layers.Dense(embed_dim)])
    self.norm1 = layers.LayerNormalization(epsilon=1e-4)
    self.norm2 = layers.LayerNormalization(epsilon=1e-4)
    self.drop1 = layers.Dropout(0.1)
    self.drop2 = layers.Dropout(0.1)

  def call(self, inputs, training = False):
    #Self Attention                   query    key/value
    attention_output = self.attention(inputs, inputs, use_causal_mask=True)
    attention_output = self.drop1(attention_output, training=training)
    output = self.norm1(inputs + attention_output)

    #Feed Forward Network
    feed_forward_output = self.feed_foward_net(output)
    feed_forward_output = self.drop2(feed_forward_output, training=training)

    return self.norm2(output + feed_forward_output)

# miniLM

Now we put everything into this `miniLM` class + a text generation function

a little overview of our smoll brain:
- `embedding layer` converts token IDs into vectors
- `transformer blocks` to process the sequence
- `dense out` gives us the logits or "score" for each word in the vocab

text generation process:
- tokenize input (also remove paddings)
- ran the loop for a fixed length
  - `tokens[-CONTEXT_WIN:]` only have context of the previous word
  - run the model -> get logits and apply temperature
  - apply top_k to get the top k-th prediction
  - softmax to convert in into probability
  -> randomly pick a word

In [7]:
class miniLM(models.Model):
  def __init__(self, context_win, max_vocab, embed_dim, heads, feed_forward, blocks, **kwargs) -> None:
    super().__init__(**kwargs)
    self.embed_layer = TokenPositionEmbedding(context_win, max_vocab, embed_dim)
    self.transformer_blocks = [TransformerBlock(embed_dim, heads, feed_forward) for _ in range(blocks)]
    self.dense_out = layers.Dense(max_vocab)

  def call(self, inputs, training=False):
      x = self.embed_layer(inputs)            # (batch, seq_len, embed_dim)
      for block in self.transformer_blocks:
        x = block(x, training=training)       # (batch, seq_len, embed_dim)
      return self.dense_out(x)                # (batch, seq_len, max_vocab)


  def gen(model, prompt, length = 6, temperature= 1.0, top_k = 5):
    input_tensor = tokenizer([prompt])
    tokens = [token for token in input_tensor.numpy()[0] if token != 0]

    gen_text = prompt
    for _ in range(length):
      context_token = tokens[-CONTEXT_WIN:]
      input_data = tf.convert_to_tensor([context_token])

      preds = model(input_data, training=False)
      next_logits = preds[0, -1, :]
      next_logits /= (temperature + 1e-7)
      top_val,  top_idx = tf.math.top_k(next_logits, k=top_k)
      top_prob = tf.nn.softmax(top_val).numpy()

      next_idx = np.random.choice(top_idx.numpy(), p=top_prob)
      if next_idx == 0 and len(tokens) >0:
        next_idx = top_idx.numpy()[1]

      tokens.append(next_idx)
      gen_text += " " + vocab[next_idx]

    return gen_text

In [16]:
skibidi_model = miniLM(CONTEXT_WIN, len(vocab), EMBED_WIN, HEADS, FEED_FOWARD, TRANSFORMER_BLOCKS)
skibidi_model.compile(optimizer="adam", loss=losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[perplexity])
skibidi_model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)

Epoch 1/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step - loss: 3.8777 - perplexity: 20790.5938
Epoch 2/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.6777 - perplexity: 21.4380
Epoch 3/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.7580 - perplexity: 14.5236
Epoch 4/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.5571 - perplexity: 11.4399
Epoch 5/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.4928 - perplexity: 9.2738
Epoch 6/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.4532 - perplexity: 7.7594
Epoch 7/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.4088 - perplexity: 6.4648
Epoch 8/70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.3867 - perplexity: 6.3210
Epoch 9/70
[1m1/1[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c91105cbd10>

In [17]:
test_input = input("Enter a prompt: ")
print(skibidi_model.gen(test_input))


Enter a prompt: i like
i like waifu :3 Machine Learning is very


In [20]:
#save full model, weight and tokenized vocab
skibidi_model.save("skibidi_model.keras")
skibidi_model.save_weights("skibidi_model.weights.h5")
import pickle
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)