<a href="https://colab.research.google.com/github/santule/ERA/blob/main/S17/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchinfo --quiet
!pip install transformers --quiet

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/AI/ERA_course/session17

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/AI/ERA_course/session17


In [13]:
from transformers import AutoTokenizer
import torch
import transformer
from torchinfo import summary
import numpy as np
from tqdm.auto import tqdm

In [19]:
n_iterations = 20
batch_size   = 16
seq_len      = 64 # also known as blocks in gpt
eval_iteration = 5
total_iterations_for_evaluation = 10
device       = "cuda" if torch.cuda.is_available() else "cpu"

# 1 - Load data and tokenize

In [5]:
data_pth = 'english_data/english.txt'
data_raw = open(data_pth, encoding ="utf-8").read()

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
vocab_size

30522

In [6]:
tokens = tokenizer.tokenize(data_raw)
token_indices = tokenizer.convert_tokens_to_ids(tokens)
data_tokens = torch.tensor(token_indices, dtype = torch.long)
len(data_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


37443

In [7]:
data_tokens

tensor([ 5219,  1014,  1011,  ..., 12375,  2015,  1012])

In [8]:
n = int(0.9 * len(data_tokens))
train_data = data_tokens[:n]
val_data = data_tokens[n:]

# 2 - Load Model

In [20]:
my_gpt = transformer.Gpt(n_embeddings = vocab_size)
my_gpt.to(device)
optimizer = torch.optim.Adam(params = my_gpt.parameters())
summary(model= my_gpt, input_size=(32,64), dtypes = [torch.int32],col_names=["input_size","output_size","num_params","trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
Gpt (Gpt)                                                    [32, 64]             [32, 64, 30522]      49,152               True
├─Embedding (embeddings)                                     [32, 64]             [32, 64, 768]        23,440,896           True
├─Dropout (embedding_dropout)                                [32, 64, 768]        [32, 64, 768]        --                   --
├─Sequential (transformer_decoder)                           [32, 64, 768]        [32, 64, 768]        --                   True
│    └─TransformerDecoderBlock (0)                           [32, 64, 768]        [32, 64, 768]        --                   True
│    │    └─MultiHeadAttentionBlock_Decoder (msa_block)      [32, 64, 768]        [32, 64, 768]        2,360,064            True
│    │    └─MLPBlock (mlp_block)                             [32, 64, 768]        [32, 64, 768

# 3 - Train Model

In [21]:
def get_batch(data: list[str],seq_len:int, batch_size: int):
  ix = torch.randint(len(data) - seq_len, (batch_size,))
  x = torch.stack([data[i: i + seq_len] for i in ix])
  y = torch.stack([data[i+1 : i + seq_len + 1] for i in ix])
  return x,y

In [22]:
for it in tqdm(range(n_iterations)):
  my_gpt.train()
  x,y = get_batch(data = train_data,seq_len = seq_len, batch_size = batch_size)
  x = x.to(device)
  y = y.to(device)

  logits,loss   = my_gpt(x,y)
  train_loss += loss.item()

  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # evaluation
  if it % eval_iteration == 0:
      print("Evaluating the model")
      my_gpt.eval()
      losses = torch.zeros(total_iterations_for_evaluation)
      for k in range(total_iterations_for_evaluation):
          x,y = get_batch(data=val_data, seq_len = seq_len, batch_size=batch_size)
          x = x.to(device)
          y = y.to(device)
          logits, loss = my_gpt(x, y)
          losses[k] = loss.item()
      val_loss = losses.mean()

      losses = torch.zeros(total_iterations_for_evaluation)
      for k in range(total_iterations_for_evaluation):
          x,y = get_batch(data=train_data, seq_len = seq_len, batch_size=batch_size)
          x = x.to(device)
          y = y.to(device)
          logits, loss = my_gpt(x, y)
          losses[k] = loss.item()
      train_loss = losses.mean()
      print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(it, train_loss, val_loss))

  0%|          | 0/20 [00:00<?, ?it/s]

Evaluating the model
step          0 | train loss 9.4621 | val loss 9.4609
Evaluating the model
step          5 | train loss 7.0127 | val loss 7.2627
Evaluating the model
step         10 | train loss 6.8966 | val loss 7.3043
Evaluating the model
step         15 | train loss 6.7066 | val loss 7.1931


My notes:
B = 16
T = 64

input batch (16,64)

token embeddings (16,64,768)

position embeddings (64, 768)