<a href="https://colab.research.google.com/github/shernee/06_CMPE297/blob/main/NanoGPT_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
batch_size = 16
block_size = 32
max_iters = 1000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 10
n_embd = 64
n_head = 8
n_layer = 8

In [4]:
input_file_path = "/content/drive/MyDrive/297_data/Sorcerer's_stone.txt"

In [5]:
class Dataset:
  def __init__(self):
    self.vocab_size = 0
    self.train_data = tf.constant([], dtype=tf.int32)
    self.val_data = tf.constant([], dtype=tf.int32)

  def read_dataset(self):
    with tf.io.gfile.GFile(input_file_path, 'r') as f:
        self.data = f.read()

  def prepare_dataset(self):
    chars = sorted(list(set(self.data)))
    self.vocab_size = len(chars)
    char_to_int = {ch: i for i, ch in enumerate(chars)}
    int_to_char = {i: ch for i, ch in enumerate(chars)}
    self.encode = lambda s: [char_to_int[c] for c in s]
    self.decode = lambda l: ''.join([int_to_char[i] for i in l])

  def data_split(self):
    data_tensor = tf.convert_to_tensor(self.encode(self.data), dtype=tf.int32)
    n = int(0.8 * len(data_tensor))
    self.train_data = data_tensor[:n]
    self.val_data = data_tensor[n:]

  def get_batch(self, split):
    data = self.train_data if split == 'train' else self.val_data
    data_length = tf.shape(data)[0]

    ix = tf.random.uniform([batch_size], minval=0, maxval=data_length - block_size, dtype=tf.int32)

    start_indices = tf.expand_dims(ix, -1)
    range_indices = tf.range(block_size, dtype=tf.int32)
    indices = start_indices + range_indices

    x = tf.gather(data, indices)

    y_indices = start_indices + 1 + range_indices
    y = tf.gather(data, y_indices)

    return x, y

In [6]:
class Loss:
  def estimate_loss(self):
    out = {}
    for split in ['train', 'val']:
      total_loss = tf.zeros([])
      for k in range(eval_iters):
          X, Y = datasetObj.get_batch(split)
          _, loss = model(X, Y)
          total_loss += loss
      out[split] = total_loss / eval_iters
    return out

lossObj = Loss()

In [7]:
class AttentionHead(tf.keras.layers.Layer):
  def __init__(self, head_size, n_embd, block_size):
    super(AttentionHead, self).__init__()
    self.key = tf.keras.layers.Dense(head_size, use_bias=False, input_shape=(n_embd,))
    self.query = tf.keras.layers.Dense(head_size, use_bias=False, input_shape=(n_embd,))
    self.value = tf.keras.layers.Dense(head_size, use_bias=False, input_shape=(n_embd,))
    self.tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)

  def call(self, x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    w = tf.matmul(q, k, transpose_b=True) * C**-0.5
    w = tf.where(self.tril[:T, :T] == 0, float('-inf'), w)
    w = tf.nn.softmax(w, axis=-1)

    v = self.value(x)
    out = tf.matmul(w, v)

    return out

In [8]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, num_heads, head_size, n_embd, block_size):
    super(MultiHeadAttention, self).__init__()
    self.heads = [AttentionHead(head_size, n_embd, block_size) for _ in range(num_heads)]
    self.proj = tf.keras.layers.Dense(n_embd)

  def call(self, x):
    out = tf.concat([h(x) for h in self.heads], axis=-1)
    return self.proj(out)

In [9]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, n_embd):
    super(FeedForward, self).__init__()
    self.net = tf.keras.Sequential([
        tf.keras.layers.Dense(4 * n_embd),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(n_embd)
    ])

  def call(self, x):
    return self.net(x)

In [10]:
class TransformerBlock(tf.keras.layers.Layer):
  def __init__(self, n_embd, n_head, block_size):
    super(TransformerBlock, self).__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size, n_embd, block_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

In [11]:
class NanoGPT(tf.keras.Model):
  def __init__(self):
    super(NanoGPT, self).__init__()
    self.token_embedding_table = tf.keras.layers.Embedding(datasetObj.vocab_size, n_embd)
    self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embd)
    self.blocks = tf.keras.Sequential([TransformerBlock(n_embd, n_head, block_size) for _ in range(n_layer)])
    self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.lm_head = tf.keras.layers.Dense(datasetObj.vocab_size)

  def call(self, idx, targets=None, training=False):
    shape = tf.shape(idx)
    B = shape[0]
    T = shape[1]
    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(tf.range(T))
    x = tok_emb + pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
        loss = None
    else:
        loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)
        loss = tf.reduce_mean(loss)

    return logits, loss

  def generate(self, idx, max_new_tokens, batch_size=4):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :]

      idx_next = tf.random.categorical(logits, num_samples=batch_size, dtype=tf.int32)

      idx = tf.concat([idx, idx_next], axis=1)

    return idx

In [12]:
def generateNext():
  context = tf.zeros((1, 1), dtype=tf.int32)
  generated_seq = model.generate(context, max_new_tokens=500)
  print(datasetObj.decode(generated_seq[0].numpy()))

In [None]:
if __name__ == '__main__':
  datasetObj = Dataset()
  datasetObj.read_dataset()
  datasetObj.prepare_dataset()
  datasetObj.data_split()

  model = NanoGPT()

  optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate)

  for iter in range(max_iters):
    print("Iteration = ", iter)
    if iter % eval_interval == 0 or iter == max_iters - 1:
      losses = lossObj.estimate_loss()
      print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = datasetObj.get_batch('train')

    with tf.GradientTape() as tape:
        _, loss = model(xb, yb)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

Iteration =  0
step 0: train loss 4.6827, val loss 4.6474




Iteration =  1
Iteration =  2
Iteration =  3
Iteration =  4
Iteration =  5
Iteration =  6
Iteration =  7
Iteration =  8
Iteration =  9
Iteration =  10
Iteration =  11
Iteration =  12
Iteration =  13
Iteration =  14
Iteration =  15
Iteration =  16
Iteration =  17
Iteration =  18
Iteration =  19
Iteration =  20
Iteration =  21
Iteration =  22
Iteration =  23
Iteration =  24
Iteration =  25
Iteration =  26
Iteration =  27
Iteration =  28
Iteration =  29
Iteration =  30
Iteration =  31
Iteration =  32
Iteration =  33
Iteration =  34
Iteration =  35
Iteration =  36
Iteration =  37
Iteration =  38
Iteration =  39
Iteration =  40
Iteration =  41
Iteration =  42
Iteration =  43
Iteration =  44
Iteration =  45
Iteration =  46
Iteration =  47
Iteration =  48
Iteration =  49
Iteration =  50
Iteration =  51
Iteration =  52
Iteration =  53
Iteration =  54
Iteration =  55
Iteration =  56
Iteration =  57
Iteration =  58
Iteration =  59
Iteration =  60
Iteration =  61
Iteration =  62
Iteration =  63
I

In [None]:
generateNext()


 WNLioiannns sat ieurlnneee taste e htib.lll
  iftpleiee.dndeia hYfh,iearlns  . I  Wihaaddns  aev  faliaennnieee :drryien,serls. 
 


N

   TtfAfrrryiaa nsnaeie.  diaeordfn , etvsweeainnnnei emvsveyaed dstuoth h lomf f'frlteh  siaaonnnneoeind rooeasn thhhovua nfshoee.    btAillel  f raas.ets  tedtrn, y arbseait    atSiltrlaieedrslea yooaatnyd e, yciwn ehieaudr.d   .    A]GVaaoordnleeea lnreiaimsnn.ti JLtSothy    yhftore lwfwaaeortrrii  hhwissge,rdd    ohmwiaaa,lsn  iedpf ahtt e itnnn eeersdne ichhhhea efdas is.
  




   I  'thooh, ,eei  Denne iisnfsfcat.lc BNwSmaaasnnina,naoi htamse eaxtfaeae, nneoiesp groari 'sbsilyepf t allsnieied'nd e  hiass   tibth.rceoeurrl.
 
 YWIM aoolnvneie bcht eeewsnd e,




 Y Joooa t  dwah    qbLmouae ,  bftaslgneaeindnnoiionnnnaeeednlsse .    itShaeoes]  walSepuchryouuaveeee,ldr  iene veeeh  itnh-noiioonsn.inowwtwoieelrr,    stfAlren  t nhbmeooowddnt o,    tahWilie dx hfag  aem ntoauokndlh  emnnpees rgtizneki,e,    JaBI    wLttarrreeoe   effcveeeetrnt .e